Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-24 13:50:37 +00:00)

Compare commits: check_repl ... release-37 (202 commits)
Commits in this range (abbreviated SHA1s; author, date and message columns were not captured):

333574be57, 79a799a143, babefdd3f9, 805fee1483, 85d6d9dc85, e40ee7c3d1, 0fe3b3646a, 529f8b5016,
fbcd174489, 7b5489a0bb, 40268dcd8d, 4436c84751, 9da06af6c9, ce1753d036, 67db8432b4, b758bf47ca,
024e306f73, f71c82e5de, faf070f288, 8c13296add, 18537be298, 3128eeff01, 227c87e333, e8f9aaf78c,
fa74d5649e, f70871dfd0, 99a1be6c4e, 76aa01c90f, 3e2f0ffb11, d597e6d42b, 71ccb07a43, ad8d777c1c,
2f97b43315, 533a92636c, bf303a6575, 8cd20485f8, 933a869f00, 8c6541fea9, 5cf75d92d8, 4e2e44e524,
ed786104f3, 0b001a0001, 4a8bd866f6, 615a490239, b95addddd5, 84b74f2bd1, fec2ad6283, 98eebd4682,
2f74287c9b, aee1bf95e3, b9de9d75ff, 7943b709e6, d7d066d493, e78ac22107, 76a8f2bb44, 8d59a8581f,
b1ddd01289, 6eae4fc9aa, 765455bca2, 4204960942, 67345d66ea, 2266ee5971, b58445d855, 36050e7f3d,
33360ed96d, 39a28d1108, efa6aa134f, 2c724e56e2, feff887c6f, 353d915fcf, 2e38098cbc, a6fe5ea1ac,
05b0aed0c1, cd1705357d, 6bc7561290, fbd3ac14b5, e437787c8f, 3460dbf90b, 6b89d99677, 6cc8ea86e4,
e62a492d6f, a475cdf642, 7002c79a47, ee6cf357b4, e5c2086b5f, 5f1208296a, 88e8e473cd, b0a77844f6,
1baf464307, e9b8e81cea, 85d6194aa4, 333a7a68ef, 6aa4e41bee, 840183e51f, cbccc94b03, fce227df22,
bd787e800f, 4a7704b4a3, ff1119da66, 4c3ba1627b, 1407174fb2, ec9dcb1889, d11d781afc, 4e44565b71,
4ed51ad33b, 1c1ebe5537, c19cb7f386, 4b97d31b16, 923ade3dd7, b04e711975, afd0a6b39a, 99752286d8,
15df93363c, bc0ab741af, 51d9dfeaa3, f63cb18155, 0de603d88e, 240913912a, 91a4ea0de2, 8608704f49,
efef68ce99, 8daefd24da, 46cc8b7982, 38cd90dd0c, a51b269f15, 43bf6d0a0f, 15273a9b66, 78aca668d0,
acbf4148ea, 6508540561, a41b5244a8, 2b3189be95, 248563c595, 14cd6ca933, eb36403e71, 3c6f779698,
f67f0c1c11, edb02d3299, 664a69e65b, 478322ebf9, 802f174072, 47f9890bae, 262265daad, 300da5b872,
7b22b5c433, ffca97bc1e, cb356f3259, c85374295f, 4992160677, bd535b3371, d90c5a03af, 2d02cc9079,
49ad94b99f, 948a217398, 125381eae7, cd01bbc715, d8b5e3b88d, 06d25f2186, f759b561f3, ece0555600,
73ea0a0b01, d8f6d6fd6f, d24de169a7, 0816168296, 277b44d57a, 68c2c3880e, 49da498f65, 2c76ba3dd7,
dbe3dc69ad, 8e5bb3ed49, ab0be7b8da, b4c55f5d24, ede70d833c, 70c3d18bb0, 7a491f52c4, 323c4ecb4f,
3d2466607e, ed478b39f4, 91585a558d, 93467eae1f, f3aac81d19, 979ad60c19, 9316cb1b1f, e7939a527a,
36d26665e1, 873347f977, e814ac16f9, ad3055d386, 94e03eb452, 380f26ef79, 3c5b7f59d7, fee89f80b5,
41cce8eaf1, f88fe0218d, cc856eca85, cf350c6002, 0ce6b6a0a3, 73f247d537, 960be82183, 806e5a6c19,
8d5df07cce, df7a9d1407
.github/workflows/build_and_test.yml (vendored; 118 changed lines)

@@ -737,34 +737,6 @@ jobs:
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--cleanup

# Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
# During the transition period we need to have extensions in both places (in S3 and in compute-node image),
# so we won't build extension twice, but extract them from compute-node.
#
# For now we use extensions image only for new custom extensions
- name: Kaniko build extensions only
run: |
# Kaniko is supposed to clean up after itself if --cleanup flag is set, but it doesn't.
# Although some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
# it still fails with error:
# error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
#
# Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;

/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--context . \
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \
--build-arg PG_VERSION=${{ matrix.version }} \
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
--dockerfile Dockerfile.compute-node \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--cleanup \
--target postgres-extensions

# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
run: rm -rf ~/.ecr

@@ -780,7 +752,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.16.2
VM_BUILDER_VERSION: v0.17.5

steps:
- name: Checkout

@@ -803,8 +775,7 @@ jobs:
run: |
./vm-builder \
-enable-file-cache \
-enable-monitor \
-enable-informant \
-cgroup-uid=postgres \
-src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

@@ -887,10 +858,8 @@ jobs:
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest

- name: Push images to production ECR
if: |

@@ -901,10 +870,8 @@ jobs:
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest

- name: Configure Docker Hub login
run: |

@@ -926,65 +893,56 @@ jobs:
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest

- name: Cleanup ECR folder
run: rm -rf ~/.ecr

upload-postgres-extensions-to-s3:
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
needs: [ tag, promote-images ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15 ]

env:
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}

build-private-extensions:
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
needs: [ tag ]
steps:
- name: Pull postgres-extensions image
- name: Set PR's status to pending and request a remote CI test
run: |
docker pull ${EXTENSIONS_IMAGE}
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions"

- name: Create postgres-extensions container
id: create-container
run: |
EID=$(docker create ${EXTENSIONS_IMAGE} true)
echo "EID=${EID}" >> $GITHUB_OUTPUT
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"pending\",
\"context\": \"build-and-upload-extensions\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"

- name: Extract postgres-extensions from container
run: |
rm -rf ./extensions-to-upload # Just in case
mkdir -p extensions-to-upload

docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/

- name: Upload postgres-extensions to S3
run: |
for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
done

- name: Cleanup
if: ${{ always() && steps.create-container.outputs.EID }}
run: |
docker rm ${{ steps.create-container.outputs.EID }} || true
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/build_and_upload_extensions.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"build-and-upload-extensions\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
\"remote_branch_name\": \"${{ github.ref_name }}\"
}
}"

deploy:
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
needs: [ promote-images, tag, regress-tests ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
steps:
- name: Fix git ownership
CODEOWNERS (13 changed lines)

@@ -1,11 +1,12 @@
/compute_tools/ @neondatabase/control-plane
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/control_plane/ @neondatabase/compute @neondatabase/storage
/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/pageserver/ @neondatabase/compute @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
/pageserver/ @neondatabase/compute @neondatabase/storage
/pgxn/ @neondatabase/compute
/proxy/ @neondatabase/control-plane
/proxy/ @neondatabase/proxy
/safekeeper/ @neondatabase/safekeepers
/vendor/ @neondatabase/compute
Cargo.lock (generated; 247 changed lines)

The lockfile diff is mechanical dependency churn. The visible hunks amount to:

- Version bumps: syn 2.0.16 → 2.0.28 (updated in every proc-macro dependency list that pins it), proc-macro2 1.0.64 → 1.0.66, quote 1.0.27 → 1.0.32, serde and serde_derive 1.0.163 → 1.0.183, smallvec 1.10.0 → 1.11.0, rustls-webpki 0.100.1 → 0.100.2, tar 0.4.38 → 0.4.40, and axum 0.6.18 → 0.6.20 (whose dependency list now also includes serde_path_to_error, sha1, sync_wrapper and tokio-tungstenite 0.20.0).
- New package entries: cgroups-rs 0.3.3, inotify 0.10.2 (alongside the existing 0.9.6, which notify keeps using), nix 0.25.1 (alongside 0.26.2), ntapi 0.4.1, serde_path_to_error 0.1.14, sysinfo 0.29.7, tungstenite 0.20.0 and tokio-tungstenite 0.20.0 (alongside the 0.18.0 versions), plus the new workspace crate vm_monitor 0.1.0 (anyhow, axum, cgroups-rs, clap, futures, inotify 0.10.2, serde, serde_json, sysinfo, tokio, tokio-postgres, tokio-stream, tokio-util, tracing, tracing-subscriber, workspace_hack).
- compute_tools gains async-compression, tokio-util, vm_monitor and zstd in its dependency list; the duplicate xattr 0.2.3 entry is removed so tar and tokio-tar reference a single xattr version.
- The dependency lists of axum, bindgen, clap, darling, control_plane, pageserver, utils, wasm-bindgen, workspace_hack and other crates are updated to match (mostly the syn 2.0.28 bump, the now-disambiguated nix / inotify / tungstenite references, and the new vm_monitor crate and its dependencies).
@@ -23,6 +23,7 @@ members = [
"libs/remote_storage",
"libs/tracing-utils",
"libs/postgres_ffi/wal_craft",
"libs/vm_monitor",
]

[workspace.package]

@@ -41,12 +42,14 @@ aws-sdk-s3 = "0.27"
aws-smithy-http = "0.55"
aws-credential-types = "0.55"
aws-types = "0.55"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
bindgen = "0.65"
bstr = "1.0"
byteorder = "1.4"
bytes = "1.0"
cfg-if = "1.0.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive"] }
close_fds = "0.3.2"

@@ -74,6 +77,7 @@ humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.9"
inotify = "0.10.2"
itertools = "0.10"
jsonwebtoken = "8"
libc = "0.2"

@@ -105,12 +109,14 @@ rustls = "0.20"
rustls-pemfile = "1"
rustls-split = "0.3"
scopeguard = "1.1"
sysinfo = "0.29.2"
sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "2.0"
sha2 = "0.10.2"
signal-hook = "0.3"
smallvec = "1.11"
socket2 = "0.5"
strum = "0.24"
strum_macros = "0.24"

@@ -133,7 +139,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.19.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
url = "2.2"
uuid = { version = "1.2", features = ["v4", "serde"] }
walkdir = "2.3.2"

@@ -169,6 +175,7 @@ storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main br
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }

## Common library dependency
workspace_hack = { version = "0.1", path = "./workspace_hack/" }

@@ -211,8 +211,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
FROM build-deps AS vector-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \
echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \

@@ -764,29 +764,6 @@ RUN rm -r /usr/local/pgsql/include
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a

#########################################################################################
#
# Extension only
#
#########################################################################################
FROM python:3.9-slim-bullseye AS generate-ext-index
ARG PG_VERSION
ARG BUILD_TAG
RUN apt update && apt install -y zstd

# copy the control files here
COPY --from=kq-imcx-pg-build /extensions/ /extensions/
COPY --from=pg-anon-pg-build /extensions/ /extensions/
COPY --from=postgis-build /extensions/ /extensions/
COPY scripts/combine_control_files.py ./combine_control_files.py
RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"

FROM scratch AS postgres-extensions
# After the transition this layer will include all extensions.
# For now, it's only a couple for testing purposes
COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
COPY --from=generate-ext-index /ext_index.json /ext_index.json

#########################################################################################
#
# Final layer

@@ -8,6 +8,7 @@ license.workspace = true
anyhow.workspace = true
async-compression.workspace = true
chrono.workspace = true
cfg-if.workspace = true
clap.workspace = true
flate2.workspace = true
futures.workspace = true

@@ -23,6 +24,7 @@ tar.workspace = true
reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true

@@ -34,4 +36,5 @@ utils.workspace = true
workspace_hack.workspace = true
toml_edit.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
zstd = "0.12.4"

@@ -19,9 +19,10 @@ Also `compute_ctl` spawns two separate service threads:
- `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
last activity requests.

If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
downscaling and (eventually) will request immediate upscaling under resource pressure.
If the `AUTOSCALING` environment variable is set, `compute_ctl` will start the
`vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
`vm-monitor` communicates with the VM autoscaling system. It coordinates
downscaling and requests immediate upscaling under resource pressure.

Usage example:
```sh

@@ -20,9 +20,10 @@
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
//! last activity requests.
//!
//! If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
//! compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
//! downscaling and (eventually) will request immediate upscaling under resource pressure.
//! If the `AUTOSCALING` environment variable is set, `compute_ctl` will start the
//! `vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
//! `vm-monitor` communicates with the VM autoscaling system. It coordinates
//! downscaling and requests immediate upscaling under resource pressure.
//!
//! Usage example:
//! ```sh

@@ -35,7 +36,6 @@
//!
use std::collections::HashMap;
use std::fs::File;
use std::panic;
use std::path::Path;
use std::process::exit;
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};

@@ -271,6 +271,55 @@ fn main() -> Result<()> {
}
};

// Start the vm-monitor if directed to. The vm-monitor only runs on linux
// because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
use std::env;
use tokio_util::sync::CancellationToken;
use tracing::warn;
let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
let cgroup = matches.get_one::<String>("cgroup");

// Only make a runtime if we need to.
// Note: it seems like you can make a runtime in an inner scope and
// if you start a task in it it won't be dropped. However, make it
// in the outermost scope just to be safe.
let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
(None, None) => None,
(None, Some(_)) => {
warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
None
}
(Some(_), None) => {
panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
}
(Some(_), Some(_)) => Some(
tokio::runtime::Builder::new_multi_thread()
.worker_threads(4)
.enable_all()
.build()
.expect("failed to create tokio runtime for monitor"),
),
};

// This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new();

let vm_monitor = &rt.as_ref().map(|rt| {
rt.spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(),
pgconnstr: file_cache_connstr.cloned(),
addr: vm_monitor_addr.cloned().unwrap(),
})),
token.clone(),
))
});
}
}

// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
if let Some(mut pg) = pg {

@@ -284,6 +333,24 @@ fn main() -> Result<()> {
exit_code = ecode.code()
}

// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
if let Some(handle) = vm_monitor {
// Kills all threads spawned by the monitor
token.cancel();
// Kills the actual task running the monitor
handle.abort();

// If handle is some, rt must have been used to produce it, and
// hence is also some
rt.unwrap().shutdown_timeout(Duration::from_secs(2));
}
}
}

// Maybe sync safekeepers again, to speed up next startup
let compute_state = compute.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");

@@ -393,6 +460,29 @@ fn cli() -> clap::Command {
.long("remote-ext-config")
.value_name("REMOTE_EXT_CONFIG"),
)
// TODO(fprasx): we currently have default arguments because the cloud PR
// to pass them in hasn't been merged yet. We should get rid of them once
// the PR is merged.
.arg(
Arg::new("vm-monitor-addr")
.long("vm-monitor-addr")
.default_value("0.0.0.0:10301")
.value_name("VM_MONITOR_ADDR"),
)
.arg(
Arg::new("cgroup")
.long("cgroup")
.default_value("neon-postgres")
.value_name("CGROUP"),
)
.arg(
Arg::new("filecache-connstr")
.long("filecache-connstr")
.default_value(
"host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
)
.value_name("FILECACHE_CONNSTR"),
)
}

#[test]

@@ -1,4 +1,5 @@
use std::collections::HashMap;
use std::env;
use std::fs;
use std::io::BufRead;
use std::os::unix::fs::PermissionsExt;

@@ -175,6 +176,27 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
}
}

/// If we are a VM, returns a [`Command`] that will run in the `neon-postgres`
/// cgroup. Otherwise returns the default `Command::new(cmd)`.
///
/// This function should be used to start postgres, as it will start it in the
/// neon-postgres cgroup if we are a VM. This allows autoscaling to control
/// postgres' resource usage. The cgroup will exist in VMs because vm-builder
/// creates it during the sysinit phase of its inittab.
fn maybe_cgexec(cmd: &str) -> Command {
// The cplane sets this env var for autoscaling computes.
// use `var_os` so we don't have to worry about the variable being valid
// unicode. Should never be a concern... but just in case
if env::var_os("AUTOSCALING").is_some() {
let mut command = Command::new("cgexec");
command.args(["-g", "memory:neon-postgres"]);
command.arg(cmd);
command
} else {
Command::new(cmd)
}
}

/// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
/// that we give to customers
fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {

@@ -451,7 +473,7 @@ impl ComputeNode {
pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
let start_time = Utc::now();

let sync_handle = Command::new(&self.pgbin)
let sync_handle = maybe_cgexec(&self.pgbin)
.args(["--sync-safekeepers"])
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
.envs(if let Some(storage_auth_token) = &storage_auth_token {

@@ -586,7 +608,7 @@ impl ComputeNode {

// Start postgres
info!("starting postgres");
let mut pg = Command::new(&self.pgbin)
let mut pg = maybe_cgexec(&self.pgbin)
.args(["-D", pgdata])
.spawn()
.expect("cannot start postgres process");

@@ -614,7 +636,7 @@ impl ComputeNode {
let pgdata_path = Path::new(&self.pgdata);

// Run postgres as a child process.
let mut pg = Command::new(&self.pgbin)
let mut pg = maybe_cgexec(&self.pgbin)
.args(["-D", &self.pgdata])
.envs(if let Some(storage_auth_token) = &storage_auth_token {
vec![("NEON_AUTH_TOKEN", storage_auth_token)]

@@ -4,7 +4,12 @@
# to your expectations and requirements.

# Root options
targets = []
targets = [
{ triple = "x86_64-unknown-linux-gnu" },
{ triple = "aarch64-unknown-linux-gnu" },
{ triple = "aarch64-apple-darwin" },
{ triple = "x86_64-apple-darwin" },
]
all-features = false
no-default-features = false
feature-depth = 1

@@ -18,7 +23,7 @@ vulnerability = "deny"
unmaintained = "warn"
yanked = "warn"
notice = "warn"
ignore = []
ignore = ["RUSTSEC-2023-0052"]

# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:

@@ -26,6 +26,7 @@ serde_json.workspace = true
signal-hook.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
@@ -1,18 +1,31 @@
use std::fmt::{Debug, Display};

use futures::Future;
use tokio_util::sync::CancellationToken;

pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;

pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
pub async fn exponential_backoff(
n: u32,
base_increment: f64,
max_seconds: f64,
cancel: &CancellationToken,
) {
let backoff_duration_seconds =
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
if backoff_duration_seconds > 0.0 {
tracing::info!(
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
);
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;

drop(
tokio::time::timeout(
std::time::Duration::from_secs_f64(backoff_duration_seconds),
cancel.cancelled(),
)
.await,
)
}
}

@@ -24,28 +37,57 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
}
}

/// Configure cancellation for a retried operation: when to cancel (the token), and
/// what kind of error to return on cancellation
pub struct Cancel<E, CF>
where
E: Display + Debug + 'static,
CF: Fn() -> E,
{
token: CancellationToken,
on_cancel: CF,
}

impl<E, CF> Cancel<E, CF>
where
E: Display + Debug + 'static,
CF: Fn() -> E,
{
pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
Self { token, on_cancel }
}
}

/// Retries the passed operation until one of the following conditions is met:
/// the encountered error is considered permanent (non-retryable), or
/// retries have been exhausted.
/// The `is_permanent` closure should be used to distinguish permanent from non-permanent errors.
/// When attempts cross `warn_threshold`, the function starts to emit log warnings.
/// The `description` argument is added to log messages. Its value should identify what the `op` is doing.
pub async fn retry<T, O, F, E>(
/// The `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken
/// to drop out promptly on shutdown.
pub async fn retry<T, O, F, E, CF>(
mut op: O,
is_permanent: impl Fn(&E) -> bool,
warn_threshold: u32,
max_retries: u32,
description: &str,
cancel: Cancel<E, CF>,
) -> Result<T, E>
where
// Not std::error::Error because anyhow::Error doesn't implement it.
// For context see https://github.com/dtolnay/anyhow/issues/63
E: Display + Debug,
E: Display + Debug + 'static,
O: FnMut() -> F,
F: Future<Output = Result<T, E>>,
CF: Fn() -> E,
{
let mut attempts = 0;
loop {
if cancel.token.is_cancelled() {
return Err((cancel.on_cancel)());
}

let result = op().await;
match result {
Ok(_) => {

@@ -80,6 +122,7 @@ where
attempts,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
&cancel.token,
)
.await;
attempts += 1;

@@ -132,6 +175,7 @@ mod tests {
1,
1,
"work",
Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
)
.await;

@@ -157,6 +201,7 @@ mod tests {
2,
2,
"work",
Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
)
.await
.unwrap();

@@ -179,6 +224,7 @@ mod tests {
2,
2,
"work",
Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
)
.await
.unwrap_err();
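The new `retry` signature above requires every caller to pass a `Cancel` value, so that long retry loops can bail out on shutdown. A minimal usage sketch, assuming the function lives in a `utils::backoff` module (the module path is not shown in this diff) and using a made-up operation and error type:

```rust
use std::io;

use tokio_util::sync::CancellationToken;
use utils::backoff::{retry, Cancel}; // assumed module path

async fn fetch_with_retries(cancel_token: CancellationToken) -> Result<Vec<u8>, io::Error> {
    // Retried until it succeeds, fails permanently, or the token is cancelled.
    retry(
        || async {
            // Hypothetical fallible operation standing in for a real network call.
            Err::<Vec<u8>, io::Error>(io::Error::new(io::ErrorKind::Other, "transient"))
        },
        // Treat permission errors as permanent; everything else is retried.
        |e| e.kind() == io::ErrorKind::PermissionDenied,
        3,  // warn_threshold
        10, // max_retries
        "fetching data",
        Cancel::new(cancel_token, || {
            io::Error::new(io::ErrorKind::Interrupted, "cancelled")
        }),
    )
    .await
}
```

On cancellation the loop returns whatever error the `on_cancel` closure produces, which is why the error type has to be chosen by the caller rather than by the backoff helper.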
libs/vm_monitor/Cargo.toml (new file; 31 lines)

@@ -0,0 +1,31 @@
[package]
name = "vm_monitor"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[[bin]]
name = "vm-monitor"
path = "./src/bin/monitor.rs"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow.workspace = true
axum.workspace = true
clap.workspace = true
futures.workspace = true
inotify.workspace = true
serde.workspace = true
serde_json.workspace = true
sysinfo.workspace = true
tokio.workspace = true
tokio-postgres.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-subscriber.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

[target.'cfg(target_os = "linux")'.dependencies]
cgroups-rs = "0.3.3"
libs/vm_monitor/README.md (new file; 34 lines)

@@ -0,0 +1,34 @@
# `vm-monitor`

The `vm-monitor` (or just monitor) is a core component of the autoscaling system,
along with the `autoscale-scheduler` and the `autoscaler-agent`s. The monitor has
two primary roles: 1) notifying agents when immediate upscaling is necessary due
to memory conditions and 2) managing Postgres' file cache and a cgroup to carry
out upscaling and downscaling decisions.

## More on scaling

We scale CPU and memory using NeonVM, our in-house QEMU tool for use with Kubernetes.
To control thresholds for receiving memory usage notifications, we start Postgres
in the `neon-postgres` cgroup and set its `memory.{max,high}`.

* See also: [`neondatabase/autoscaling`](https://github.com/neondatabase/autoscaling/)
* See also: [`neondatabase/vm-monitor`](https://github.com/neondatabase/vm-monitor/),
where initial development of the monitor happened. The repository is no longer
maintained but the commit history may be useful for debugging.

## Structure

The `vm-monitor` is loosely made up of a few systems. These are:
* the server: this is just a simple `axum` server that accepts requests and
upgrades them to websocket connections. The server only allows one connection at
a time. This means that upon receiving a new connection, the server will terminate
the old one if it exists.
* the filecache: a struct that allows communication with the Postgres file cache.
On startup, we connect to the filecache and hold on to the connection for the
entire monitor lifetime.
* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
listening for `memory.high` events and setting its `memory.{high,max}` values.
* the runner: the runner marries the filecache and cgroup watcher together,
communicating with the agent through the `Dispatcher`, and then calling filecache
and cgroup watcher functions as needed to upscale and downscale.
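The README's "one connection at a time" server behaviour can be pictured with a small hand-rolled sketch. This is illustrative only, not the monitor's actual server code: the route, state type and handler names are invented (the port merely matches the `--vm-monitor-addr` default added to `compute_ctl` above), and it assumes only axum 0.6 with the `ws` feature plus tokio-util, both already in the workspace.

```rust
use std::sync::{Arc, Mutex};

use axum::{
    extract::{ws::WebSocketUpgrade, State},
    response::IntoResponse,
    routing::get,
    Router,
};
use tokio_util::sync::CancellationToken;

// Tracks the currently served connection so a newer one can displace it.
#[derive(Default)]
struct Active(Mutex<Option<CancellationToken>>);

async fn ws_handler(State(active): State<Arc<Active>>, ws: WebSocketUpgrade) -> impl IntoResponse {
    ws.on_upgrade(move |mut socket| async move {
        let token = CancellationToken::new();
        // Cancel whatever connection was previously being served.
        if let Some(old) = active.0.lock().unwrap().replace(token.clone()) {
            old.cancel();
        }
        loop {
            tokio::select! {
                // A newer connection arrived: stop serving this one.
                _ = token.cancelled() => break,
                msg = socket.recv() => match msg {
                    Some(Ok(_msg)) => { /* hand the message to the runner here */ }
                    _ => break, // client went away or errored
                },
            }
        }
    })
}

#[tokio::main]
async fn main() {
    let app = Router::new()
        .route("/monitor", get(ws_handler)) // route name is an assumption
        .with_state(Arc::new(Active::default()));
    axum::Server::bind(&"0.0.0.0:10301".parse().unwrap())
        .serve(app.into_make_service())
        .await
        .unwrap();
}
```

The design point the README is making is visible here: the shared `Active` slot holds at most one cancellation token, so accepting a new upgrade implicitly shuts down the previous session instead of multiplexing agents.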
33  libs/vm_monitor/src/bin/monitor.rs  Normal file
@@ -0,0 +1,33 @@
// We expose a standalone binary _and_ start the monitor in `compute_ctl` so that
// we can test the monitor as part of the entire autoscaling system in
// neondatabase/autoscaling.
//
// The monitor was previously started by vm-builder, and for testing purposes,
// we can mimic that setup with this binary.

#[cfg(target_os = "linux")]
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    use clap::Parser;
    use tokio_util::sync::CancellationToken;
    use tracing_subscriber::EnvFilter;
    use vm_monitor::Args;

    let subscriber = tracing_subscriber::fmt::Subscriber::builder()
        .json()
        .with_file(true)
        .with_line_number(true)
        .with_span_list(true)
        .with_env_filter(EnvFilter::from_default_env())
        .finish();
    tracing::subscriber::set_global_default(subscriber)?;

    let args: &'static Args = Box::leak(Box::new(Args::parse()));
    let token = CancellationToken::new();
    vm_monitor::start(args, token).await
}

#[cfg(not(target_os = "linux"))]
fn main() {
    panic!("the monitor requires cgroups, which are only available on linux")
}
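// Example invocation (hypothetical values; the long flags are derived from the
// `Args` struct in lib.rs, where `--cgroup` and `--pgconnstr` are optional and
// `--addr` is required):
//
//   vm-monitor --addr 0.0.0.0:10301 --cgroup neon-postgres --pgconnstr 'host=127.0.0.1 user=postgres dbname=postgres'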
693  libs/vm_monitor/src/cgroup.rs  Normal file
@@ -0,0 +1,693 @@
|
||||
use std::{
|
||||
fmt::{Debug, Display},
|
||||
fs,
|
||||
pin::pin,
|
||||
sync::atomic::{AtomicU64, Ordering},
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use cgroups_rs::{
|
||||
freezer::FreezerController,
|
||||
hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
|
||||
memory::MemController,
|
||||
MaxValue,
|
||||
Subsystem::{Freezer, Mem},
|
||||
};
|
||||
use inotify::{EventStream, Inotify, WatchMask};
|
||||
use tokio::sync::mpsc::{self, error::TryRecvError};
|
||||
use tokio::time::{Duration, Instant};
|
||||
use tokio_stream::{Stream, StreamExt};
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::protocol::Resources;
|
||||
use crate::MiB;
|
||||
|
||||
/// Monotonically increasing counter of the number of memory.high events
|
||||
/// the cgroup has experienced.
|
||||
///
|
||||
/// We use this to determine if a modification to the `memory.events` file actually
|
||||
/// changed the `high` field. If not, we don't care about the change. When we
|
||||
/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
|
||||
/// to see if it changed since last time.
|
||||
pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
|
||||
|
||||
/// Monotonically increasing counter that gives each cgroup event a unique id.
|
||||
///
|
||||
/// This allows us to answer questions like "did this upscale arrive before this
|
||||
/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
|
||||
/// with a sequence number. As such, prefer to use the `Sequenced` type rather
|
||||
/// than this static directly.
|
||||
static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
|
||||
|
||||
/// A memory event type reported in memory.events.
|
||||
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
|
||||
pub enum MemoryEvent {
|
||||
Low,
|
||||
High,
|
||||
Max,
|
||||
Oom,
|
||||
OomKill,
|
||||
OomGroupKill,
|
||||
}
|
||||
|
||||
impl MemoryEvent {
|
||||
fn as_str(&self) -> &str {
|
||||
match self {
|
||||
MemoryEvent::Low => "low",
|
||||
MemoryEvent::High => "high",
|
||||
MemoryEvent::Max => "max",
|
||||
MemoryEvent::Oom => "oom",
|
||||
MemoryEvent::OomKill => "oom_kill",
|
||||
MemoryEvent::OomGroupKill => "oom_group_kill",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for MemoryEvent {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration for a `CgroupWatcher`
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Config {
|
||||
// The target difference between the total memory reserved for the cgroup
|
||||
// and the value of the cgroup's memory.high.
|
||||
//
|
||||
// In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
|
||||
// use (equal to system memory, minus whatever's taken out for the file cache).
|
||||
oom_buffer_bytes: u64,
|
||||
|
||||
// The amount of memory, in bytes, below a proposed new value for
|
||||
// memory.high that the cgroup's memory usage must be for us to downscale
|
||||
//
|
||||
// In other words, we can downscale only when:
|
||||
//
|
||||
// memory.current + memory_high_buffer_bytes < (proposed) memory.high
|
||||
//
|
||||
// TODO: there are some minor issues with this approach -- in particular, that we might have
|
||||
// memory in use by the kernel's page cache that we're actually ok with getting rid of.
|
||||
pub(crate) memory_high_buffer_bytes: u64,
|
||||
|
||||
// The maximum duration, in milliseconds, that we're allowed to pause
|
||||
// the cgroup for while waiting for the autoscaler-agent to upscale us
|
||||
max_upscale_wait: Duration,
|
||||
|
||||
// The required minimum time, in milliseconds, that we must wait before re-freezing
|
||||
// the cgroup while waiting for the autoscaler-agent to upscale us.
|
||||
do_not_freeze_more_often_than: Duration,
|
||||
|
||||
// The amount of memory, in bytes, that we should periodically increase memory.high
|
||||
// by while waiting for the autoscaler-agent to upscale us.
|
||||
//
|
||||
// This exists to avoid the excessive throttling that happens when a cgroup is above its
|
||||
// memory.high for too long. See more here:
|
||||
// https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217
|
||||
memory_high_increase_by_bytes: u64,
|
||||
|
||||
// The period, in milliseconds, at which we should repeatedly increase the value
|
||||
// of the cgroup's memory.high while we're waiting on upscaling and memory.high
|
||||
// is still being hit.
|
||||
//
|
||||
// Technically speaking, this actually serves as a rate limit to moderate responding to
|
||||
// memory.high events, but these are roughly equivalent if the process is still allocating
|
||||
// memory.
|
||||
memory_high_increase_every: Duration,
|
||||
}
|
||||
|
||||
impl Config {
|
||||
/// Calculate the new value for the cgroup's memory.high based on system memory
|
||||
pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
|
||||
total_system_mem.saturating_sub(self.oom_buffer_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Config {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
oom_buffer_bytes: 100 * MiB,
|
||||
memory_high_buffer_bytes: 100 * MiB,
|
||||
// while waiting for upscale, don't freeze for more than 20ms every 1s
|
||||
max_upscale_wait: Duration::from_millis(20),
|
||||
do_not_freeze_more_often_than: Duration::from_millis(1000),
|
||||
// while waiting for upscale, increase memory.high by 10MiB every 25ms
|
||||
memory_high_increase_by_bytes: 10 * MiB,
|
||||
memory_high_increase_every: Duration::from_millis(25),
|
||||
}
|
||||
}
|
||||
}
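// A minimal sketch of how the defaults above translate into a memory.high value
// (hypothetical numbers; `Config` and `MiB` are the items defined in this crate):
//
//     let config = Config::default();
//     // For a cgroup with 4 GiB of memory available:
//     let total = 4096 * MiB;
//     // memory.high = total - oom_buffer_bytes = 4096 MiB - 100 MiB = 3996 MiB
//     assert_eq!(config.calculate_memory_high_value(total), 3996 * MiB);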
|
||||
|
||||
/// Used to represent data that is associated with a certain point in time, such
|
||||
/// as an upscale request or memory.high event.
|
||||
///
|
||||
/// Internally, creating a `Sequenced` uses a static atomic counter to obtain
|
||||
/// a unique sequence number. Sequence numbers are monotonically increasing,
|
||||
/// allowing us to answer questions like "did this upscale happen after this
|
||||
/// memory.high event?" by comparing the sequence numbers of the two events.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Sequenced<T> {
|
||||
seqnum: u64,
|
||||
data: T,
|
||||
}
|
||||
|
||||
impl<T> Sequenced<T> {
|
||||
pub fn new(data: T) -> Self {
|
||||
Self {
|
||||
seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel),
|
||||
data,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Responds to `MonitorEvents` to manage the cgroup: preventing it from being
|
||||
/// OOM killed or throttled.
|
||||
///
|
||||
/// The `CgroupWatcher` primarily achieves this by reading from a stream of
|
||||
/// `MonitorEvent`s. See `main_signals_loop` for details on how to keep the
|
||||
/// cgroup happy.
|
||||
#[derive(Debug)]
|
||||
pub struct CgroupWatcher {
|
||||
pub config: Config,
|
||||
|
||||
/// The sequence number of the last upscale.
|
||||
///
|
||||
/// If we receive a memory.high event that has a _lower_ sequence number than
|
||||
/// `last_upscale_seqnum`, then we know it occurred before the upscale, and we
|
||||
/// can safely ignore it.
|
||||
///
|
||||
/// Note: Like the `events` field, this doesn't _need_ interior mutability but we
|
||||
/// use it anyways so that methods take `&self`, not `&mut self`.
|
||||
last_upscale_seqnum: AtomicU64,
|
||||
|
||||
/// A channel on which we send messages to request upscale from the dispatcher.
|
||||
upscale_requester: mpsc::Sender<()>,
|
||||
|
||||
/// The actual cgroup we are watching and managing.
|
||||
cgroup: cgroups_rs::Cgroup,
|
||||
}
|
||||
|
||||
/// Read memory.events for the desired event type.
|
||||
///
|
||||
/// `path` specifies the path to the desired `memory.events` file.
|
||||
/// For more info, see the `memory.events` section of the [kernel docs]
|
||||
/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
|
||||
fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
|
||||
let contents = fs::read_to_string(path)
|
||||
.with_context(|| format!("failed to read memory.events from {path}"))?;
|
||||
|
||||
// The contents of the file look like:
|
||||
// low 42
|
||||
// high 101
|
||||
// ...
|
||||
contents
|
||||
.lines()
|
||||
.filter_map(|s| s.split_once(' '))
|
||||
.find(|(e, _)| *e == event.as_str())
|
||||
.ok_or_else(|| anyhow!("failed to find entry for memory.{event} events in {path}"))
|
||||
.and_then(|(_, count)| {
|
||||
count
|
||||
.parse::<u64>()
|
||||
.with_context(|| format!("failed to parse memory.{event} as u64"))
|
||||
})
|
||||
}
|
||||
|
||||
/// Create an event stream that produces events whenever the file at the provided
|
||||
/// path is modified.
|
||||
fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
|
||||
info!("creating file watcher for {path}");
|
||||
let inotify = Inotify::init().context("failed to initialize file watcher")?;
|
||||
inotify
|
||||
.watches()
|
||||
.add(path, WatchMask::MODIFY)
|
||||
.with_context(|| format!("failed to start watching {path}"))?;
|
||||
inotify
|
||||
// The inotify docs use [0u8; 1024] so we'll just copy them. We only need
|
||||
// to store one event at a time - if the event gets written over, that's
|
||||
// ok. We still see that there is an event. For more information, see:
|
||||
// https://man7.org/linux/man-pages/man7/inotify.7.html
|
||||
.into_event_stream([0u8; 1024])
|
||||
.context("failed to start inotify event stream")
|
||||
}
|
||||
|
||||
impl CgroupWatcher {
|
||||
/// Create a new `CgroupWatcher`.
|
||||
#[tracing::instrument(skip_all, fields(%name))]
|
||||
pub fn new(
|
||||
name: String,
|
||||
// A channel on which to send upscale requests
|
||||
upscale_requester: mpsc::Sender<()>,
|
||||
) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
|
||||
// TODO: clarify exactly why we need v2
|
||||
// Make sure cgroups v2 (aka unified) are supported
|
||||
if !is_cgroup2_unified_mode() {
|
||||
anyhow::bail!("cgroups v2 not supported");
|
||||
}
|
||||
let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);
|
||||
|
||||
// Start monitoring the cgroup for memory events. In general, for
|
||||
// cgroups v2 (aka unified), metrics are reported in files like
|
||||
// > `/sys/fs/cgroup/{name}/{metric}`
|
||||
// We are looking for `memory.high` events, which are stored in the
|
||||
// file `memory.events`. For more info, see the `memory.events` section
|
||||
// of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
|
||||
let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
|
||||
let memory_events = create_file_watcher(&path)
|
||||
.with_context(|| format!("failed to create event watcher for {path}"))?
|
||||
// This would be nicer with .inspect_err followed by .ok
|
||||
.filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
|
||||
Ok(high) => Some(high),
|
||||
Err(error) => {
|
||||
// TODO: Might want to just panic here
|
||||
warn!(?error, "failed to read high events count from {}", &path);
|
||||
None
|
||||
}
|
||||
})
|
||||
// Only report the event if the memory.high count increased
|
||||
.filter_map(|high| {
|
||||
if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
|
||||
Some(high)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.map(Sequenced::new);
|
||||
|
||||
let initial_count = get_event_count(
|
||||
&format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
|
||||
MemoryEvent::High,
|
||||
)?;
|
||||
|
||||
info!(initial_count, "initial memory.high event count");
|
||||
|
||||
// Hard update `MEMORY_EVENT_COUNT` since there could have been processes
|
||||
// running in the cgroup before that caused it to be non-zero.
|
||||
MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
|
||||
|
||||
Ok((
|
||||
Self {
|
||||
cgroup,
|
||||
upscale_requester,
|
||||
last_upscale_seqnum: AtomicU64::new(0),
|
||||
config: Default::default(),
|
||||
},
|
||||
memory_events,
|
||||
))
|
||||
}
|
||||
|
||||
/// The entrypoint for the `CgroupWatcher`.
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn watch<E>(
|
||||
&self,
|
||||
// These are ~dependency injected~ (fancy, I know) because this function
|
||||
// should never return.
|
||||
// -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
|
||||
// -> therefore: if we want to stick it in an Arc so many threads can access
|
||||
// it, methods can never take mutable access.
|
||||
// - note: we use the Arc strategy so that a) we can call this function
|
||||
// right here and b) the runner can call the set/get_memory methods
|
||||
// -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
|
||||
// we just pass them in here instead of holding them in fields, as that
|
||||
// would require this method to take &mut self.
|
||||
mut upscales: mpsc::Receiver<Sequenced<Resources>>,
|
||||
events: E,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
E: Stream<Item = Sequenced<u64>>,
|
||||
{
|
||||
// There are several actions we might take when receiving a `memory.high` event,
|
||||
// such as freezing the cgroup, or increasing its `memory.high`. We don't
|
||||
// want to do these things too often (because postgres needs to run, and
|
||||
// we only have so much memory). These timers serve as rate limits for this.
|
||||
let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
|
||||
let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO));
|
||||
let mut events = pin!(events);
|
||||
|
||||
// Are we waiting to be upscaled? Could be true if we request upscale due
|
||||
// to a memory.high event and it does not arrive in time.
|
||||
let mut waiting_on_upscale = false;
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
upscale = upscales.recv() => {
|
||||
let Sequenced { seqnum, data } = upscale
|
||||
.context("failed to listen on upscale notification channel")?;
|
||||
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
|
||||
info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
|
||||
}
|
||||
event = events.next() => {
|
||||
let Some(Sequenced { seqnum, .. }) = event else {
|
||||
bail!("failed to listen for memory.high events")
|
||||
};
|
||||
// The memory.high came before our last upscale, so we consider
|
||||
// it resolved
|
||||
if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
|
||||
info!(
|
||||
"received memory.high event, but it came before our last upscale -> ignoring it"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
// The memory.high came after our latest upscale. We don't
|
||||
// want to do anything yet, so peek the next event in hopes
|
||||
// that it's an upscale.
|
||||
if let Some(upscale_num) = self
|
||||
.upscaled(&mut upscales)
|
||||
.context("failed to check if we were upscaled")?
|
||||
{
|
||||
if upscale_num > seqnum {
|
||||
info!(
|
||||
"received memory.high event, but it came before our last upscale -> ignoring it"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// If it's been long enough since we last froze, freeze the
|
||||
// cgroup and request upscale
|
||||
if wait_to_freeze.is_elapsed() {
|
||||
info!("received memory.high event -> requesting upscale");
|
||||
waiting_on_upscale = self
|
||||
.handle_memory_high_event(&mut upscales)
|
||||
.await
|
||||
.context("failed to handle upscale")?;
|
||||
wait_to_freeze
|
||||
.as_mut()
|
||||
.reset(Instant::now() + self.config.do_not_freeze_more_often_than);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Ok, we can't freeze, just request upscale
|
||||
if !waiting_on_upscale {
|
||||
info!("received memory.high event, but too soon to refreeze -> requesting upscale");
|
||||
|
||||
// Check to make sure we haven't been upscaled in the
|
||||
// meantime (can happen if the agent independently decides
|
||||
// to upscale us again)
|
||||
if self
|
||||
.upscaled(&mut upscales)
|
||||
.context("failed to check if we were upscaled")?
|
||||
.is_some()
|
||||
{
|
||||
info!("no need to request upscaling because we got upscaled");
|
||||
continue;
|
||||
}
|
||||
self.upscale_requester
|
||||
.send(())
|
||||
.await
|
||||
.context("failed to request upscale")?;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Shoot, we can't freeze and we're still waiting on upscale,
|
||||
// increase memory.high to reduce throttling
|
||||
if wait_to_increase_memory_high.is_elapsed() {
|
||||
info!(
|
||||
"received memory.high event, \
|
||||
but too soon to refreeze and already requested upscale \
|
||||
-> increasing memory.high"
|
||||
);
|
||||
|
||||
// Check to make sure we haven't been upscaled in the
|
||||
// meantime (can happen if the agent independently decides
|
||||
// to upscale us again)
|
||||
if self
|
||||
.upscaled(&mut upscales)
|
||||
.context("failed to check if we were upscaled")?
|
||||
.is_some()
|
||||
{
|
||||
info!("no need to increase memory.high because got upscaled");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Request upscale anyways (the agent will handle deduplicating
|
||||
// requests)
|
||||
self.upscale_requester
|
||||
.send(())
|
||||
.await
|
||||
.context("failed to request upscale")?;
|
||||
|
||||
let memory_high =
|
||||
self.get_high_bytes().context("failed to get memory.high")?;
|
||||
let new_high = memory_high + self.config.memory_high_increase_by_bytes;
|
||||
info!(
|
||||
current_high_bytes = memory_high,
|
||||
new_high_bytes = new_high,
|
||||
"updating memory.high"
|
||||
);
|
||||
self.set_high_bytes(new_high)
|
||||
.context("failed to set memory.high")?;
|
||||
wait_to_increase_memory_high
|
||||
.as_mut()
|
||||
.reset(Instant::now() + self.config.memory_high_increase_every)
|
||||
}
|
||||
|
||||
// we can't do anything
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle a `memory.high`, returning whether we are still waiting on upscale
|
||||
/// by the time the function returns.
|
||||
///
|
||||
/// The general plan for handling a `memory.high` event is as follows:
|
||||
/// 1. Freeze the cgroup
|
||||
/// 2. Start a timer for `self.config.max_upscale_wait`
|
||||
/// 3. Request upscale
|
||||
/// 4. After the timer elapses or we receive upscale, thaw the cgroup.
|
||||
/// 5. Return whether or not we are still waiting for upscale. If we are,
|
||||
/// we'll increase the cgroup's memory.high to avoid getting OOM killed
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn handle_memory_high_event(
|
||||
&self,
|
||||
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
|
||||
) -> anyhow::Result<bool> {
|
||||
// Immediately freeze the cgroup before doing anything else.
|
||||
info!("received memory.high event -> freezing cgroup");
|
||||
self.freeze().context("failed to freeze cgroup")?;
|
||||
|
||||
// We'll use this for logging durations
|
||||
let start_time = Instant::now();
|
||||
|
||||
// Await the upscale until we have to unfreeze
|
||||
let timed =
|
||||
tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
|
||||
|
||||
// Request the upscale
|
||||
info!(
|
||||
wait = ?self.config.max_upscale_wait,
|
||||
"sending request for immediate upscaling",
|
||||
);
|
||||
self.upscale_requester
|
||||
.send(())
|
||||
.await
|
||||
.context("failed to request upscale")?;
|
||||
|
||||
let waiting_on_upscale = match timed.await {
|
||||
Ok(Ok(())) => {
|
||||
info!(elapsed = ?start_time.elapsed(), "received upscale in time");
|
||||
false
|
||||
}
|
||||
// **important**: unfreeze the cgroup before ?-reporting the error
|
||||
Ok(Err(e)) => {
|
||||
info!("error waiting for upscale -> thawing cgroup");
|
||||
self.thaw()
|
||||
.context("failed to thaw cgroup after errored waiting for upscale")?;
|
||||
Err(e.context("failed to await upscale"))?
|
||||
}
|
||||
Err(_) => {
|
||||
info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
|
||||
true
|
||||
}
|
||||
};
|
||||
|
||||
info!("thawing cgroup");
|
||||
self.thaw().context("failed to thaw cgroup")?;
|
||||
|
||||
Ok(waiting_on_upscale)
|
||||
}
|
||||
|
||||
/// Checks whether we were just upscaled, returning the upscale's sequence
|
||||
/// number if so.
|
||||
#[tracing::instrument(skip_all)]
|
||||
fn upscaled(
|
||||
&self,
|
||||
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
|
||||
) -> anyhow::Result<Option<u64>> {
|
||||
let Sequenced { seqnum, data } = match upscales.try_recv() {
|
||||
Ok(upscale) => upscale,
|
||||
Err(TryRecvError::Empty) => return Ok(None),
|
||||
Err(TryRecvError::Disconnected) => {
|
||||
bail!("upscale notification channel was disconnected")
|
||||
}
|
||||
};
|
||||
|
||||
// Make sure to update the last upscale sequence number
|
||||
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
|
||||
info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
|
||||
Ok(Some(seqnum))
|
||||
}
|
||||
|
||||
/// Await an upscale event, discarding any `memory.high` events received in
|
||||
/// the process.
|
||||
///
|
||||
/// This is used in `handle_memory_high_event`, where we need to listen
|
||||
/// for upscales in particular so we know if we can thaw the cgroup early.
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn await_upscale(
|
||||
&self,
|
||||
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
|
||||
) -> anyhow::Result<()> {
|
||||
let Sequenced { seqnum, .. } = upscales
|
||||
.recv()
|
||||
.await
|
||||
.context("error listening for upscales")?;
|
||||
|
||||
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get the cgroup's name.
|
||||
pub fn path(&self) -> &str {
|
||||
self.cgroup.path()
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents a set of limits we apply to a cgroup to control memory usage.
|
||||
///
|
||||
/// Setting these values also affects the thresholds for receiving usage alerts.
|
||||
#[derive(Debug)]
|
||||
pub struct MemoryLimits {
|
||||
high: u64,
|
||||
max: u64,
|
||||
}
|
||||
|
||||
impl MemoryLimits {
|
||||
pub fn new(high: u64, max: u64) -> Self {
|
||||
Self { max, high }
|
||||
}
|
||||
}
|
||||
|
||||
// Methods for manipulating the actual cgroup
|
||||
impl CgroupWatcher {
|
||||
/// Get a handle on the freezer subsystem.
|
||||
fn freezer(&self) -> anyhow::Result<&FreezerController> {
|
||||
if let Some(Freezer(freezer)) = self
|
||||
.cgroup
|
||||
.subsystems()
|
||||
.iter()
|
||||
.find(|sub| matches!(sub, Freezer(_)))
|
||||
{
|
||||
Ok(freezer)
|
||||
} else {
|
||||
anyhow::bail!("could not find freezer subsystem")
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempt to freeze the cgroup.
|
||||
pub fn freeze(&self) -> anyhow::Result<()> {
|
||||
self.freezer()
|
||||
.context("failed to get freezer subsystem")?
|
||||
.freeze()
|
||||
.context("failed to freeze")
|
||||
}
|
||||
|
||||
/// Attempt to thaw the cgroup.
|
||||
pub fn thaw(&self) -> anyhow::Result<()> {
|
||||
self.freezer()
|
||||
.context("failed to get freezer subsystem")?
|
||||
.thaw()
|
||||
.context("failed to thaw")
|
||||
}
|
||||
|
||||
/// Get a handle on the memory subsystem.
|
||||
///
|
||||
/// Note: this method does not require `self.memory_update_lock` because
|
||||
/// getting a handle to the subsystem does not access any of the files we
|
||||
/// care about, such as memory.high and memory.events
|
||||
fn memory(&self) -> anyhow::Result<&MemController> {
|
||||
if let Some(Mem(memory)) = self
|
||||
.cgroup
|
||||
.subsystems()
|
||||
.iter()
|
||||
.find(|sub| matches!(sub, Mem(_)))
|
||||
{
|
||||
Ok(memory)
|
||||
} else {
|
||||
anyhow::bail!("could not find memory subsystem")
|
||||
}
|
||||
}
|
||||
|
||||
/// Get cgroup current memory usage.
|
||||
pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
|
||||
Ok(self
|
||||
.memory()
|
||||
.context("failed to get memory subsystem")?
|
||||
.memory_stat()
|
||||
.usage_in_bytes)
|
||||
}
|
||||
|
||||
/// Set cgroup memory.high threshold.
|
||||
pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
|
||||
self.memory()
|
||||
.context("failed to get memory subsystem")?
|
||||
.set_mem(cgroups_rs::memory::SetMemory {
|
||||
low: None,
|
||||
high: Some(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)),
|
||||
min: None,
|
||||
max: None,
|
||||
})
|
||||
.context("failed to set memory.high")
|
||||
}
|
||||
|
||||
/// Set cgroup memory.high and memory.max.
|
||||
pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
|
||||
info!(
|
||||
limits.high,
|
||||
limits.max,
|
||||
path = self.path(),
|
||||
"writing new memory limits",
|
||||
);
|
||||
self.memory()
|
||||
.context("failed to get memory subsystem while setting memory limits")?
|
||||
.set_mem(cgroups_rs::memory::SetMemory {
|
||||
min: None,
|
||||
low: None,
|
||||
high: Some(MaxValue::Value(
|
||||
u64::min(limits.high, i64::MAX as u64) as i64
|
||||
)),
|
||||
max: Some(MaxValue::Value(u64::min(limits.max, i64::MAX as u64) as i64)),
|
||||
})
|
||||
.context("failed to set memory limits")
|
||||
}
|
||||
|
||||
/// Given some amount of available memory, set the desired cgroup memory limits
|
||||
pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
|
||||
let new_high = self.config.calculate_memory_high_value(available_memory);
|
||||
let limits = MemoryLimits::new(new_high, available_memory);
|
||||
info!(
|
||||
path = self.path(),
|
||||
memory = ?limits,
|
||||
"setting cgroup memory",
|
||||
);
|
||||
self.set_limits(&limits)
|
||||
.context("failed to set cgroup memory limits")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get memory.high threshold.
|
||||
pub fn get_high_bytes(&self) -> anyhow::Result<u64> {
|
||||
let high = self
|
||||
.memory()
|
||||
.context("failed to get memory subsystem while getting memory statistics")?
|
||||
.get_mem()
|
||||
.map(|mem| mem.high)
|
||||
.context("failed to get memory statistics from subsystem")?;
|
||||
match high {
|
||||
Some(MaxValue::Max) => Ok(i64::MAX as u64),
|
||||
Some(MaxValue::Value(high)) => Ok(high as u64),
|
||||
None => anyhow::bail!("failed to read memory.high from memory subsystem"),
|
||||
}
|
||||
}
|
||||
}
|
||||
153  libs/vm_monitor/src/dispatcher.rs  Normal file
@@ -0,0 +1,153 @@
|
||||
//! Managing the websocket connection and other signals in the monitor.
|
||||
//!
|
||||
//! Contains types that manage the interaction (not data interchange, see `protocol`)
|
||||
//! between agent and monitor, allowing us to process and send messages in a
|
||||
//! straightforward way. The dispatcher also manages the signals that come from
|
||||
//! the cgroup (requesting upscale), and the signals that go to the cgroup
|
||||
//! (notifying it of upscale).
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use axum::extract::ws::{Message, WebSocket};
|
||||
use futures::{
|
||||
stream::{SplitSink, SplitStream},
|
||||
SinkExt, StreamExt,
|
||||
};
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::info;
|
||||
|
||||
use crate::cgroup::Sequenced;
|
||||
use crate::protocol::{
|
||||
OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
|
||||
PROTOCOL_MIN_VERSION,
|
||||
};
|
||||
|
||||
/// The central handler for all communications in the monitor.
|
||||
///
|
||||
/// The dispatcher has two purposes:
|
||||
/// 1. Manage the connection to the agent, sending and receiving messages.
|
||||
/// 2. Communicate with the cgroup manager, notifying it when upscale is received,
|
||||
/// and sending a message to the agent when the cgroup manager requests
|
||||
/// upscale.
|
||||
#[derive(Debug)]
|
||||
pub struct Dispatcher {
|
||||
/// We read agent messages off of `source`
|
||||
pub(crate) source: SplitStream<WebSocket>,
|
||||
|
||||
/// We send messages to the agent through `sink`
|
||||
sink: SplitSink<WebSocket, Message>,
|
||||
|
||||
/// Used to notify the cgroup when we are upscaled.
|
||||
pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
|
||||
|
||||
/// When the cgroup requests upscale it will send on this channel. In response
|
||||
/// we send an `UpscaleRequest` to the agent.
|
||||
pub(crate) request_upscale_events: mpsc::Receiver<()>,
|
||||
|
||||
/// The protocol version we have agreed to use with the agent. This is negotiated
|
||||
/// during the creation of the dispatcher, and should be the highest shared protocol
|
||||
/// version.
|
||||
///
|
||||
// NOTE: currently unused, but will almost certainly be used in the future
|
||||
// as the protocol changes
|
||||
#[allow(unused)]
|
||||
pub(crate) proto_version: ProtocolVersion,
|
||||
}
|
||||
|
||||
impl Dispatcher {
|
||||
/// Creates a new dispatcher using the passed-in connection.
|
||||
///
|
||||
/// Performs a negotiation with the agent to determine the highest protocol
|
||||
/// version that both support. This consists of two steps:
|
||||
/// 1. Wait for the agent to send the range of protocols it supports.
|
||||
/// 2. Send a protocol version that works for us as well, or an error if there
|
||||
/// is no compatible version.
|
||||
pub async fn new(
|
||||
stream: WebSocket,
|
||||
notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
|
||||
request_upscale_events: mpsc::Receiver<()>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let (mut sink, mut source) = stream.split();
|
||||
|
||||
// Figure out the highest protocol version we both support
|
||||
info!("waiting for agent to send protocol version range");
|
||||
let Some(message) = source.next().await else {
|
||||
bail!("websocket connection closed while performing protocol handshake")
|
||||
};
|
||||
|
||||
let message = message.context("failed to read protocol version range off connection")?;
|
||||
|
||||
let Message::Text(message_text) = message else {
|
||||
// All messages should be in text form, since we don't do any
|
||||
// pinging/ponging. See nhooyr/websocket's implementation and the
|
||||
// agent for more info
|
||||
bail!("received non-text message during proocol handshake: {message:?}")
|
||||
};
|
||||
|
||||
let monitor_range = ProtocolRange {
|
||||
min: PROTOCOL_MIN_VERSION,
|
||||
max: PROTOCOL_MAX_VERSION,
|
||||
};
|
||||
|
||||
let agent_range: ProtocolRange = serde_json::from_str(&message_text)
|
||||
.context("failed to deserialize protocol version range")?;
|
||||
|
||||
info!(range = ?agent_range, "received protocol version range");
|
||||
|
||||
let highest_shared_version = match monitor_range.highest_shared_version(&agent_range) {
|
||||
Ok(version) => {
|
||||
sink.send(Message::Text(
|
||||
serde_json::to_string(&ProtocolResponse::Version(version)).unwrap(),
|
||||
))
|
||||
.await
|
||||
.context("failed to notify agent of negotiated protocol version")?;
|
||||
version
|
||||
}
|
||||
Err(e) => {
|
||||
sink.send(Message::Text(
|
||||
serde_json::to_string(&ProtocolResponse::Error(format!(
|
||||
"Received protocol version range {} which does not overlap with {}",
|
||||
agent_range, monitor_range
|
||||
)))
|
||||
.unwrap(),
|
||||
))
|
||||
.await
|
||||
.context("failed to notify agent of no overlap between protocol version ranges")?;
|
||||
Err(e).context("error determining suitable protocol version range")?
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
sink,
|
||||
source,
|
||||
notify_upscale_events,
|
||||
request_upscale_events,
|
||||
proto_version: highest_shared_version,
|
||||
})
|
||||
}
|
||||
|
||||
/// Notify the cgroup manager that we have received upscale and wait for
|
||||
/// the acknowledgement.
|
||||
#[tracing::instrument(skip_all, fields(?resources))]
|
||||
pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
|
||||
self.notify_upscale_events
|
||||
.send(resources)
|
||||
.await
|
||||
.context("failed to send resources and oneshot sender across channel")
|
||||
}
|
||||
|
||||
/// Send a message to the agent.
|
||||
///
|
||||
/// Although this function is small, it has one major benefit: it is the only
|
||||
/// way to send data across the connection, and you can only pass in a proper
|
||||
/// `MonitorMessage`. Without safeguards like this, it's easy to accidentally
|
||||
/// serialize the wrong thing and send it, since `self.sink.send` will take
|
||||
/// any string.
|
||||
pub async fn send(&mut self, message: OutboundMsg) -> anyhow::Result<()> {
|
||||
info!(?message, "sending message");
|
||||
let json = serde_json::to_string(&message).context("failed to serialize message")?;
|
||||
self.sink
|
||||
.send(Message::Text(json))
|
||||
.await
|
||||
.context("stream error sending message")
|
||||
}
|
||||
}
|
||||
306  libs/vm_monitor/src/filecache.rs  Normal file
@@ -0,0 +1,306 @@
|
||||
//! Logic for configuring and scaling the Postgres file cache.
|
||||
|
||||
use std::num::NonZeroU64;
|
||||
|
||||
use crate::MiB;
|
||||
use anyhow::{anyhow, Context};
|
||||
use tokio_postgres::{types::ToSql, Client, NoTls, Row};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info};
|
||||
|
||||
/// Manages Postgres' file cache by keeping a connection open.
|
||||
#[derive(Debug)]
|
||||
pub struct FileCacheState {
|
||||
client: Client,
|
||||
conn_str: String,
|
||||
pub(crate) config: FileCacheConfig,
|
||||
|
||||
/// A token for cancelling spawned threads during shutdown.
|
||||
token: CancellationToken,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct FileCacheConfig {
|
||||
/// Whether the file cache is *actually* stored in memory (e.g. by writing to
|
||||
/// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
|
||||
/// memory available for the cgroup.
|
||||
pub(crate) in_memory: bool,
|
||||
|
||||
/// The size of the file cache, in terms of the size of the resource it consumes
|
||||
/// (currently: only memory)
|
||||
///
|
||||
/// For example, setting `resource_multiplier = 0.75` gives the cache a target size of 75% of total
|
||||
/// resources.
|
||||
///
|
||||
/// This value must be strictly between 0 and 1.
|
||||
resource_multiplier: f64,
|
||||
|
||||
/// The required minimum amount of memory, in bytes, that must remain available
|
||||
/// after subtracting the file cache.
|
||||
///
|
||||
/// This value must be non-zero.
|
||||
min_remaining_after_cache: NonZeroU64,
|
||||
|
||||
/// Controls the rate of increase in the file cache's size as it grows from zero
|
||||
/// (when total resources equals min_remaining_after_cache) to the desired size based on
|
||||
/// `resource_multiplier`.
|
||||
///
|
||||
/// A `spread_factor` of zero means that all additional resources will go to the cache until it
|
||||
/// reaches the desired size. Setting `spread_factor` to N roughly means "for every 1 byte added to
|
||||
/// the cache's size, N bytes are reserved for the rest of the system, until the cache gets to
|
||||
/// its desired size".
|
||||
///
|
||||
/// This value must be >= 0, and must retain an increase that is more than what would be given by
|
||||
/// `resource_multiplier`. For example, setting `resource_multiplier` = 0.75 but `spread_factor` = 1
|
||||
/// would be invalid, because `spread_factor` would induce only 50% usage - never reaching the 75%
|
||||
/// as desired by `resource_multiplier`.
|
||||
///
|
||||
/// `spread_factor` is too large if `(spread_factor + 1) * resource_multiplier >= 1`.
|
||||
spread_factor: f64,
|
||||
}
|
||||
|
||||
impl Default for FileCacheConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
in_memory: true,
|
||||
// 75 %
|
||||
resource_multiplier: 0.75,
|
||||
// 640 MiB; (512 + 128)
|
||||
min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
|
||||
// ensure any increase in file cache size is split 90-10 with 10% to other memory
|
||||
spread_factor: 0.1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FileCacheConfig {
|
||||
/// Make sure fields of the config are consistent.
|
||||
pub fn validate(&self) -> anyhow::Result<()> {
|
||||
// Single field validity
|
||||
anyhow::ensure!(
|
||||
0.0 < self.resource_multiplier && self.resource_multiplier < 1.0,
|
||||
"resource_multiplier must be between 0.0 and 1.0 exclusive, got {}",
|
||||
self.resource_multiplier
|
||||
);
|
||||
anyhow::ensure!(
|
||||
self.spread_factor >= 0.0,
|
||||
"spread_factor must be >= 0, got {}",
|
||||
self.spread_factor
|
||||
);
|
||||
|
||||
// Check that `resource_multiplier` and `spread_factor` are valid w.r.t. each other.
|
||||
//
|
||||
// As shown in `calculate_cache_size`, we have two lines resulting from `resource_multiplier` and
|
||||
// `spread_factor`, respectively. They are:
|
||||
//
|
||||
// `total` `min_remaining_after_cache`
|
||||
// size = ————————————————————— - —————————————————————————————
|
||||
// `spread_factor` + 1 `spread_factor` + 1
|
||||
//
|
||||
// and
|
||||
//
|
||||
// size = `resource_multiplier` × total
|
||||
//
|
||||
// .. where `total` is the total resources. These are isomorphic to the typical 'y = mx + b'
|
||||
// form, with y = "size" and x = "total".
|
||||
//
|
||||
// These lines intersect at:
|
||||
//
|
||||
// `min_remaining_after_cache`
|
||||
// ———————————————————————————————————————————————————
|
||||
// 1 - `resource_multiplier` × (`spread_factor` + 1)
|
||||
//
|
||||
// We want to ensure that this value (a) exists, and (b) is >= `min_remaining_after_cache`. This is
|
||||
// guaranteed when '`resource_multiplier` × (`spread_factor` + 1)' is less than 1.
|
||||
// (We also need it to be >= 0, but that's already guaranteed.)
|
||||
|
||||
let intersect_factor = self.resource_multiplier * (self.spread_factor + 1.0);
|
||||
anyhow::ensure!(
|
||||
intersect_factor < 1.0,
|
||||
"incompatible resource_multipler and spread_factor"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Calculate the desired size of the cache, given the total memory
|
||||
pub fn calculate_cache_size(&self, total: u64) -> u64 {
|
||||
// *Note*: all units are in bytes, until the very last line.
|
||||
let available = total.saturating_sub(self.min_remaining_after_cache.get());
|
||||
if available == 0 {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Conversions to ensure we don't overflow from floating-point ops
|
||||
let size_from_spread =
|
||||
i64::max(0, (available as f64 / (1.0 + self.spread_factor)) as i64) as u64;
|
||||
|
||||
let size_from_normal = (total as f64 * self.resource_multiplier) as u64;
|
||||
|
||||
let byte_size = u64::min(size_from_spread, size_from_normal);
|
||||
|
||||
// The file cache operates in units of mebibytes, so the sizes we produce should
|
||||
// be rounded to a mebibyte. We round down to be conservative.
|
||||
byte_size / MiB * MiB
|
||||
}
|
||||
}
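// A minimal worked example of the sizing above (hypothetical numbers, using the
// `FileCacheConfig` defaults defined in this file):
//
//     let config = FileCacheConfig::default();
//     // total = 4 GiB, min_remaining_after_cache = 640 MiB:
//     //   size_from_spread = (4096 - 640) MiB / 1.1 ≈ 3141 MiB
//     //   size_from_normal = 0.75 * 4096 MiB    = 3072 MiB
//     // The smaller of the two wins, and it is already MiB-aligned:
//     assert_eq!(config.calculate_cache_size(4096 * MiB), 3072 * MiB);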
|
||||
|
||||
impl FileCacheState {
|
||||
/// Connect to the file cache.
|
||||
#[tracing::instrument(skip_all, fields(%conn_str, ?config))]
|
||||
pub async fn new(
|
||||
conn_str: &str,
|
||||
config: FileCacheConfig,
|
||||
token: CancellationToken,
|
||||
) -> anyhow::Result<Self> {
|
||||
config.validate().context("file cache config is invalid")?;
|
||||
|
||||
info!(conn_str, "connecting to Postgres file cache");
|
||||
let client = FileCacheState::connect(conn_str, token.clone())
|
||||
.await
|
||||
.context("failed to connect to postgres file cache")?;
|
||||
|
||||
let conn_str = conn_str.to_string();
|
||||
Ok(Self {
|
||||
client,
|
||||
config,
|
||||
conn_str,
|
||||
token,
|
||||
})
|
||||
}
|
||||
|
||||
/// Connect to Postgres.
|
||||
///
|
||||
/// Aborts the spawned thread if the kill signal is received. This is not
|
||||
/// a method as it is called in [`FileCacheState::new`].
|
||||
#[tracing::instrument(skip_all, fields(%conn_str))]
|
||||
async fn connect(conn_str: &str, token: CancellationToken) -> anyhow::Result<Client> {
|
||||
let (client, conn) = tokio_postgres::connect(conn_str, NoTls)
|
||||
.await
|
||||
.context("failed to connect to pg client")?;
|
||||
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own. See tokio-postgres docs.
|
||||
crate::spawn_with_cancel(
|
||||
token,
|
||||
|res| {
|
||||
if let Err(error) = res {
|
||||
error!(%error, "postgres error")
|
||||
}
|
||||
},
|
||||
conn,
|
||||
);
|
||||
|
||||
Ok(client)
|
||||
}
|
||||
|
||||
/// Execute a query with a retry if necessary.
|
||||
///
|
||||
/// If the initial query fails, we restart the database connection and attempt
|
||||
/// it again.
|
||||
#[tracing::instrument(skip_all, fields(%statement))]
|
||||
pub async fn query_with_retry(
|
||||
&mut self,
|
||||
statement: &str,
|
||||
params: &[&(dyn ToSql + Sync)],
|
||||
) -> anyhow::Result<Vec<Row>> {
|
||||
match self
|
||||
.client
|
||||
.query(statement, params)
|
||||
.await
|
||||
.context("failed to execute query")
|
||||
{
|
||||
Ok(rows) => Ok(rows),
|
||||
Err(e) => {
|
||||
error!(error = ?e, "postgres error: {e} -> retrying");
|
||||
|
||||
let client = FileCacheState::connect(&self.conn_str, self.token.clone())
|
||||
.await
|
||||
.context("failed to connect to postgres file cache")?;
|
||||
info!("successfully reconnected to postgres client");
|
||||
|
||||
// Replace the old client and attempt the query with the new one
|
||||
self.client = client;
|
||||
self.client
|
||||
.query(statement, params)
|
||||
.await
|
||||
.context("failed to execute query a second time")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the current size of the file cache.
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn get_file_cache_size(&mut self) -> anyhow::Result<u64> {
|
||||
self.query_with_retry(
|
||||
// The file cache GUC variable is in MiB, but the conversion with
|
||||
// pg_size_bytes means that the end result we get is in bytes.
|
||||
"SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit'));",
|
||||
&[],
|
||||
)
|
||||
.await
|
||||
.context("failed to query pg for file cache size")?
|
||||
.first()
|
||||
.ok_or_else(|| anyhow!("file cache size query returned no rows"))?
|
||||
// pg_size_bytes returns a bigint which is the same as an i64.
|
||||
.try_get::<_, i64>(0)
|
||||
// Since the size of the table is not negative, the cast is sound.
|
||||
.map(|bytes| bytes as u64)
|
||||
.context("failed to extract file cache size from query result")
|
||||
}
|
||||
|
||||
/// Attempt to set the file cache size, returning the size it was actually
|
||||
/// set to.
|
||||
#[tracing::instrument(skip_all, fields(%num_bytes))]
|
||||
pub async fn set_file_cache_size(&mut self, num_bytes: u64) -> anyhow::Result<u64> {
|
||||
let max_bytes = self
|
||||
// The file cache GUC variable is in MiB, but the conversion with pg_size_bytes
|
||||
// means that the end result we get is in bytes.
|
||||
.query_with_retry(
|
||||
"SELECT pg_size_bytes(current_setting('neon.max_file_cache_size'));",
|
||||
&[],
|
||||
)
|
||||
.await
|
||||
.context("failed to query pg for max file cache size")?
|
||||
.first()
|
||||
.ok_or_else(|| anyhow!("max file cache size query returned no rows"))?
|
||||
.try_get::<_, i64>(0)
|
||||
.map(|bytes| bytes as u64)
|
||||
.context("failed to extract max file cache size from query result")?;
|
||||
|
||||
let max_mb = max_bytes / MiB;
|
||||
let num_mb = u64::min(num_bytes, max_bytes) / MiB;
|
||||
|
||||
let capped = if num_bytes > max_bytes {
|
||||
" (capped by maximum size)"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
|
||||
info!(
|
||||
size = num_mb,
|
||||
max = max_mb,
|
||||
"updating file cache size {capped}",
|
||||
);
|
||||
|
||||
// note: even though the normal ways to get the cache size produce values with trailing "MB"
|
||||
// (hence why we call pg_size_bytes in `get_file_cache_size`'s query), the format
|
||||
// it expects to set the value is "integer number of MB" without trailing units.
|
||||
// For some reason, this *really* wasn't working with normal arguments, so that's
|
||||
// why we're constructing the query here.
|
||||
self.client
|
||||
.query(
|
||||
&format!("ALTER SYSTEM SET neon.file_cache_size_limit = {};", num_mb),
|
||||
&[],
|
||||
)
|
||||
.await
|
||||
.context("failed to change file cache size limit")?;
|
||||
|
||||
// must use pg_reload_conf to have the settings change take effect
|
||||
self.client
|
||||
.execute("SELECT pg_reload_conf();", &[])
|
||||
.await
|
||||
.context("failed to reload config")?;
|
||||
|
||||
Ok(num_mb * MiB)
|
||||
}
|
||||
}
|
||||
205  libs/vm_monitor/src/lib.rs  Normal file
@@ -0,0 +1,205 @@
|
||||
#![cfg(target_os = "linux")]
|
||||
|
||||
use anyhow::Context;
|
||||
use axum::{
|
||||
extract::{ws::WebSocket, State, WebSocketUpgrade},
|
||||
response::Response,
|
||||
};
|
||||
use axum::{routing::get, Router, Server};
|
||||
use clap::Parser;
|
||||
use futures::Future;
|
||||
use std::{fmt::Debug, time::Duration};
|
||||
use sysinfo::{RefreshKind, System, SystemExt};
|
||||
use tokio::{sync::broadcast, task::JoinHandle};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info};
|
||||
|
||||
use runner::Runner;
|
||||
|
||||
// Code that interfaces with agent
|
||||
pub mod dispatcher;
|
||||
pub mod protocol;
|
||||
|
||||
pub mod cgroup;
|
||||
pub mod filecache;
|
||||
pub mod runner;
|
||||
|
||||
/// The vm-monitor is an autoscaling component started by compute_ctl.
|
||||
///
|
||||
/// It carries out autoscaling decisions (upscaling/downscaling) and responds to
|
||||
/// memory pressure by making requests to the autoscaler-agent.
|
||||
#[derive(Debug, Parser)]
|
||||
pub struct Args {
|
||||
/// The name of the cgroup we should monitor for memory.high events. This
|
||||
/// is the cgroup that postgres should be running in.
|
||||
#[arg(short, long)]
|
||||
pub cgroup: Option<String>,
|
||||
|
||||
/// The connection string for the Postgres file cache we should manage.
|
||||
#[arg(short, long)]
|
||||
pub pgconnstr: Option<String>,
|
||||
|
||||
/// The address we should listen on for connection requests. For the
|
||||
/// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
|
||||
#[arg(short, long)]
|
||||
pub addr: String,
|
||||
}
|
||||
|
||||
impl Args {
|
||||
pub fn addr(&self) -> &str {
|
||||
&self.addr
|
||||
}
|
||||
}
|
||||
|
||||
/// The number of bytes in one mebibyte.
|
||||
#[allow(non_upper_case_globals)]
|
||||
const MiB: u64 = 1 << 20;
|
||||
|
||||
/// Convert a quantity in bytes to a quantity in mebibytes, generally for display
|
||||
/// purposes. (Most calculations in this crate use bytes directly)
|
||||
pub fn bytes_to_mebibytes(bytes: u64) -> f32 {
|
||||
(bytes as f32) / (MiB as f32)
|
||||
}
|
||||
|
||||
pub fn get_total_system_memory() -> u64 {
|
||||
System::new_with_specifics(RefreshKind::new().with_memory()).total_memory()
|
||||
}
|
||||
|
||||
/// Global app state for the Axum server
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ServerState {
|
||||
/// Used to close old connections.
|
||||
///
|
||||
/// When a new connection is made, we send a message signalling to the old
|
||||
/// connection to close.
|
||||
pub sender: broadcast::Sender<()>,
|
||||
|
||||
/// Used to cancel all spawned threads in the monitor.
|
||||
pub token: CancellationToken,
|
||||
|
||||
// The CLI args
|
||||
pub args: &'static Args,
|
||||
}
|
||||
|
||||
/// Spawn a thread that may get cancelled by the provided [`CancellationToken`].
|
||||
///
|
||||
/// This is mainly meant to be called with futures that will be pending for a very
|
||||
/// long time, or are not meant to return. If it is not desirable for the future to
|
||||
/// ever resolve, such as in the case of [`cgroup::CgroupWatcher::watch`], the error can
|
||||
/// be logged with `f`.
|
||||
pub fn spawn_with_cancel<T, F>(
|
||||
token: CancellationToken,
|
||||
f: F,
|
||||
future: T,
|
||||
) -> JoinHandle<Option<T::Output>>
|
||||
where
|
||||
T: Future + Send + 'static,
|
||||
T::Output: Send + 'static,
|
||||
F: FnOnce(&T::Output) + Send + 'static,
|
||||
{
|
||||
tokio::spawn(async move {
|
||||
tokio::select! {
|
||||
_ = token.cancelled() => {
|
||||
info!("received global kill signal");
|
||||
None
|
||||
}
|
||||
res = future => {
|
||||
f(&res);
|
||||
Some(res)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
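// A minimal usage sketch (mirroring how filecache.rs spawns its Postgres
// connection task; `some_long_running_future` is a placeholder):
//
//     let handle = spawn_with_cancel(
//         token.clone(),
//         |res| {
//             if let Err(error) = res {
//                 tracing::error!(?error, "background task failed");
//             }
//         },
//         some_long_running_future,
//     );
//     // `handle` resolves to `None` if the token was cancelled first.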
|
||||
|
||||
/// The entrypoint to the binary.
|
||||
///
|
||||
/// Set up tracing, parse arguments, and start an http server.
|
||||
pub async fn start(args: &'static Args, token: CancellationToken) -> anyhow::Result<()> {
|
||||
// This channel is used to close old connections. When a new connection is
|
||||
// made, we send a message signalling to the old connection to close.
|
||||
let (sender, _) = tokio::sync::broadcast::channel::<()>(1);
|
||||
|
||||
let app = Router::new()
|
||||
// This route gets upgraded to a websocket connection. We only support
|
||||
// one connection at a time, which we enforce by killing old connections
|
||||
// when we receive a new one.
|
||||
.route("/monitor", get(ws_handler))
|
||||
.with_state(ServerState {
|
||||
sender,
|
||||
token,
|
||||
args,
|
||||
});
|
||||
|
||||
let addr = args.addr();
|
||||
let bound = Server::try_bind(&addr.parse().expect("parsing address should not fail"))
|
||||
.with_context(|| format!("failed to bind to {addr}"))?;
|
||||
|
||||
info!(addr, "server bound");
|
||||
|
||||
bound
|
||||
.serve(app.into_make_service())
|
||||
.await
|
||||
.context("server exited")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handles incoming websocket connections.
|
||||
///
|
||||
/// If we are already connected to an agent, we kill that old connection
|
||||
/// and accept the new one.
|
||||
#[tracing::instrument(name = "/monitor", skip_all, fields(?args))]
|
||||
pub async fn ws_handler(
|
||||
ws: WebSocketUpgrade,
|
||||
State(ServerState {
|
||||
sender,
|
||||
token,
|
||||
args,
|
||||
}): State<ServerState>,
|
||||
) -> Response {
|
||||
// Kill the old monitor
|
||||
info!("closing old connection if there is one");
|
||||
let _ = sender.send(());
|
||||
|
||||
// Start the new one. Wow, the cycle of death and rebirth
|
||||
let closer = sender.subscribe();
|
||||
ws.on_upgrade(|ws| start_monitor(ws, args, closer, token))
|
||||
}
|
||||
|
||||
/// Starts the monitor. If startup fails or the monitor exits, an error will
|
||||
/// be logged and our internal state will be reset to allow for new connections.
|
||||
#[tracing::instrument(skip_all, fields(?args))]
|
||||
async fn start_monitor(
|
||||
ws: WebSocket,
|
||||
args: &Args,
|
||||
kill: broadcast::Receiver<()>,
|
||||
token: CancellationToken,
|
||||
) {
|
||||
info!("accepted new websocket connection -> starting monitor");
|
||||
let timeout = Duration::from_secs(4);
|
||||
let monitor = tokio::time::timeout(
|
||||
timeout,
|
||||
Runner::new(Default::default(), args, ws, kill, token),
|
||||
)
|
||||
.await;
|
||||
let mut monitor = match monitor {
|
||||
Ok(Ok(monitor)) => monitor,
|
||||
Ok(Err(error)) => {
|
||||
error!(?error, "failed to create monitor");
|
||||
return;
|
||||
}
|
||||
Err(_) => {
|
||||
error!(
|
||||
?timeout,
|
||||
"creating monitor timed out (probably waiting to receive protocol range)"
|
||||
);
|
||||
return;
|
||||
}
|
||||
};
|
||||
info!("connected to agent");
|
||||
|
||||
match monitor.run().await {
|
||||
Ok(()) => info!("monitor was killed due to new connection"),
|
||||
Err(e) => error!(error = ?e, "monitor terminated unexpectedly"),
|
||||
}
|
||||
}
|
||||
241  libs/vm_monitor/src/protocol.rs  Normal file
@@ -0,0 +1,241 @@
|
||||
//! Types representing protocols and actual agent-monitor messages.
|
||||
//!
|
||||
//! The pervasive use of serde modifiers throughout this module is to ease
|
||||
//! serialization on the go side. Because go does not have enums (which model
|
||||
//! messages well), it is harder to model messages, and we accommodate that with
|
||||
//! serde.
|
||||
//!
|
||||
//! *Note*: the agent sends and receives messages in different ways.
|
||||
//!
|
||||
//! The agent serializes messages in the following form and then sends them. The use
|
||||
//! of `#[serde(tag = "type", content = "content")]` allows us to use `Type`
|
||||
//! to determine how to deserialize `Content`.
|
||||
//! ```ignore
|
||||
//! struct {
|
||||
//! Content any
|
||||
//! Type string
|
||||
//! Id uint64
|
||||
//! }
|
||||
//! ```
|
||||
//! and receives messages in the form:
|
||||
//! ```ignore
|
||||
//! struct {
|
||||
//! {fields embedded}
|
||||
//! Type string
|
||||
//! Id uint64
|
||||
//! }
|
||||
//! ```
|
||||
//! After reading the type field, the agent will decode the entire message
|
||||
//! again, this time into the correct type using the embedded fields.
|
||||
//! Because the agent cannot just extract the json contained in a certain field
|
||||
//! (it initially deserializes to `map[string]interface{}`), we keep the fields
|
||||
//! at the top level, so the entire piece of json can be deserialized into a struct,
|
||||
//! such as a `DownscaleResult`, with the `Type` and `Id` fields ignored.
|
||||
|
||||
use core::fmt;
|
||||
use std::cmp;
|
||||
|
||||
use serde::{de::Error, Deserialize, Serialize};
|
||||
|
||||
/// A Message we send to the agent.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct OutboundMsg {
|
||||
#[serde(flatten)]
|
||||
pub(crate) inner: OutboundMsgKind,
|
||||
pub(crate) id: usize,
|
||||
}
|
||||
|
||||
impl OutboundMsg {
|
||||
pub fn new(inner: OutboundMsgKind, id: usize) -> Self {
|
||||
Self { inner, id }
|
||||
}
|
||||
}
|
||||
|
||||
/// The different underlying message types we can send to the agent.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
#[serde(tag = "type")]
|
||||
pub enum OutboundMsgKind {
|
||||
/// Indicates that the agent sent an invalid message, i.e, we couldn't
|
||||
/// properly deserialize it.
|
||||
InvalidMessage { error: String },
|
||||
/// Indicates that we experienced an internal error while processing a message.
|
||||
/// For example, if a cgroup operation fails while trying to handle an upscale,
|
||||
/// we return `InternalError`.
|
||||
InternalError { error: String },
|
||||
/// Returned to the agent once we have finished handling an upscale. If the
|
||||
/// handling was unsuccessful, an `InternalError` will get returned instead.
|
||||
/// *Note*: this is a struct variant because of the way go serializes struct{}
|
||||
UpscaleConfirmation {},
|
||||
/// Indicates to the agent that we are urgently requesting resources.
|
||||
/// *Note*: this is a struct variant because of the way go serializes struct{}
|
||||
UpscaleRequest {},
|
||||
/// Returned to the agent once we have finished attempting to downscale. If
|
||||
/// an error occured trying to do so, an `InternalError` will get returned instead.
|
||||
/// However, if we are simply unsuccessful (for example, due to needing the resources),
|
||||
/// that gets included in the `DownscaleResult`.
|
||||
DownscaleResult {
|
||||
// FIXME for the future (once the informant is deprecated)
|
||||
// As of the time of writing, the agent/informant version of this struct is
|
||||
// called api.DownscaleResult. This struct has uppercase fields which are
|
||||
// serialized as such. Thus, we serialize using uppercase names so we don't
|
||||
// have to make a breaking change to the agent<->informant protocol. Once
|
||||
// the informant has been superseded by the monitor, we can add the correct
|
||||
// struct tags to api.DownscaleResult without causing a breaking change,
|
||||
// since we don't need to support the agent<->informant protocol anymore.
|
||||
#[serde(rename = "Ok")]
|
||||
ok: bool,
|
||||
#[serde(rename = "Status")]
|
||||
status: String,
|
||||
},
|
||||
/// Part of the bidirectional heartbeat. The heartbeat is initiated by the
|
||||
/// agent.
|
||||
/// *Note*: this is a struct variant because of the way go serializes struct{}
|
||||
HealthCheck {},
|
||||
}
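// Roughly, the wire format produced by the serde attributes above (field order
// may differ; the values here are illustrative only):
//
//     OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, 1)
//         => {"type": "UpscaleRequest", "id": 1}
//
//     OutboundMsg::new(
//         OutboundMsgKind::DownscaleResult { ok: false, status: "not enough memory".into() },
//         2,
//     )
//         => {"type": "DownscaleResult", "Ok": false, "Status": "not enough memory", "id": 2}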
|
||||
|
||||
/// A message received form the agent.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct InboundMsg {
|
||||
#[serde(flatten)]
|
||||
pub(crate) inner: InboundMsgKind,
|
||||
pub(crate) id: usize,
|
||||
}
|
||||
|
||||
/// The different underlying message types we can receive from the agent.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
#[serde(tag = "type", content = "content")]
|
||||
pub enum InboundMsgKind {
|
||||
/// Indicates that we sent an invalid message, i.e., the agent couldn't
|
||||
/// properly deserialize it.
|
||||
InvalidMessage { error: String },
|
||||
/// Indicates that the informant experienced an internal error while processing
|
||||
/// a message. For example, if it failed to request upscale from the agent, it
|
||||
/// would return an `InternalError`.
|
||||
InternalError { error: String },
|
||||
/// Indicates to us that we have been granted more resources. We should respond
|
||||
/// with an `UpscaleConfirmation` when done handling the resources (increasing
|
||||
/// file cache size, cgroup memory limits).
|
||||
UpscaleNotification { granted: Resources },
|
||||
/// A request to reduce resource usage. We should respond with a `DownscaleResult`
|
||||
/// when done.
|
||||
DownscaleRequest { target: Resources },
|
||||
/// Part of the bidirectional heartbeat. The heartbeat is initiated by the
|
||||
/// agent.
|
||||
/// *Note*: this is a struct variant because of the way go serializes struct{}
|
||||
HealthCheck {},
|
||||
}
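
A sketch (not part of the diff) of an inbound payload, assuming `serde_json`: because `InboundMsgKind` is adjacently tagged (`type`/`content`) and flattened into `InboundMsg`, a downscale request arrives as a flat map with `type`, `content`, and `id` keys. The concrete numbers here are made up.

#[cfg(test)]
mod inbound_wire_format_sketch {
    use super::*;

    #[test]
    fn downscale_request_deserializes() {
        // Hypothetical agent payload: 1 vCPU, 2 GiB of memory.
        let raw =
            r#"{"type":"DownscaleRequest","content":{"target":{"cpu":1.0,"mem":2147483648}},"id":4}"#;
        let msg: InboundMsg = serde_json::from_str(raw).unwrap();
        assert_eq!(msg.id, 4);
        match msg.inner {
            InboundMsgKind::DownscaleRequest { target } => {
                assert_eq!(target.cpu, 1.0);
                assert_eq!(target.mem, 2 * 1024 * 1024 * 1024);
            }
            other => panic!("unexpected message kind: {other:?}"),
        }
    }
}
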
|
||||
|
||||
/// Represents the resources granted to a VM.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Copy)]
|
||||
// Renamed because the agent has multiple resources types:
|
||||
// `Resources` (milliCPU/memory slots)
|
||||
// `Allocation` (vCPU/bytes) <- what we correspond to
|
||||
#[serde(rename(serialize = "Allocation", deserialize = "Allocation"))]
|
||||
pub struct Resources {
|
||||
/// Number of vCPUs
|
||||
pub(crate) cpu: f64,
|
||||
/// Bytes of memory
|
||||
pub(crate) mem: u64,
|
||||
}
|
||||
|
||||
impl Resources {
|
||||
pub fn new(cpu: f64, mem: u64) -> Self {
|
||||
Self { cpu, mem }
|
||||
}
|
||||
}
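
A sketch (not part of the diff), assuming `serde_json`: the container-level `Allocation` rename changes the struct's reported name, but the JSON keys remain the lowercase field names.

#[cfg(test)]
mod resources_wire_format_sketch {
    use super::*;

    #[test]
    fn fields_stay_lowercase() {
        // Hypothetical allocation: a quarter vCPU and 1 GiB of memory.
        let json = serde_json::to_value(Resources::new(0.25, 1024 * 1024 * 1024)).unwrap();
        assert_eq!(json["cpu"], 0.25);
        assert_eq!(json["mem"], 1024u64 * 1024 * 1024);
    }
}
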
|
||||
|
||||
pub const PROTOCOL_MIN_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
|
||||
pub const PROTOCOL_MAX_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Ord, Eq, Serialize, Deserialize)]
|
||||
pub struct ProtocolVersion(u8);
|
||||
|
||||
impl ProtocolVersion {
|
||||
/// Represents v1.0 of the agent<->monitor protocol - the initial version
|
||||
///
|
||||
/// Currently the latest version.
|
||||
const V1_0: ProtocolVersion = ProtocolVersion(1);
|
||||
}
|
||||
|
||||
impl fmt::Display for ProtocolVersion {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match *self {
|
||||
ProtocolVersion(0) => f.write_str("<invalid: zero>"),
|
||||
ProtocolVersion::V1_0 => f.write_str("v1.0"),
|
||||
other => write!(f, "<unknown: {other}>"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A set of protocol bounds that determines which protocol versions we can speak.
|
||||
///
|
||||
/// These bounds are inclusive.
|
||||
#[derive(Debug)]
|
||||
pub struct ProtocolRange {
|
||||
pub min: ProtocolVersion,
|
||||
pub max: ProtocolVersion,
|
||||
}
|
||||
|
||||
// Use a custom deserialize impl to ensure that `self.min <= self.max`
|
||||
impl<'de> Deserialize<'de> for ProtocolRange {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
#[derive(Deserialize)]
|
||||
struct InnerProtocolRange {
|
||||
min: ProtocolVersion,
|
||||
max: ProtocolVersion,
|
||||
}
|
||||
let InnerProtocolRange { min, max } = InnerProtocolRange::deserialize(deserializer)?;
|
||||
if min > max {
|
||||
Err(D::Error::custom(format!(
|
||||
"min version = {min} is greater than max version = {max}",
|
||||
)))
|
||||
} else {
|
||||
Ok(ProtocolRange { min, max })
|
||||
}
|
||||
}
|
||||
}
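
A small sketch (not part of the diff) of the invariant this custom impl enforces, assuming `serde_json`:

#[cfg(test)]
mod protocol_range_deser_sketch {
    use super::*;

    #[test]
    fn rejects_inverted_bounds() {
        // min > max must be refused by the custom Deserialize impl.
        assert!(serde_json::from_str::<ProtocolRange>(r#"{"min": 2, "max": 1}"#).is_err());
        // A well-formed range still parses.
        assert!(serde_json::from_str::<ProtocolRange>(r#"{"min": 1, "max": 2}"#).is_ok());
    }
}
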
|
||||
|
||||
impl fmt::Display for ProtocolRange {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if self.min == self.max {
|
||||
f.write_fmt(format_args!("{}", self.max))
|
||||
} else {
|
||||
f.write_fmt(format_args!("{} to {}", self.min, self.max))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ProtocolRange {
|
||||
/// Find the highest shared version between two `ProtocolRange`s
|
||||
pub fn highest_shared_version(&self, other: &Self) -> anyhow::Result<ProtocolVersion> {
|
||||
// We first have to make sure the ranges are overlapping. Once we know
|
||||
// this, we can merge the ranges by taking the max of the mins and the
|
||||
// min of the maxes.
|
||||
if self.min > other.max {
|
||||
anyhow::bail!(
|
||||
"Non-overlapping bounds: other.max = {} was less than self.min = {}",
|
||||
other.max,
|
||||
self.min,
|
||||
)
|
||||
} else if self.max < other.min {
|
||||
anyhow::bail!(
"Non-overlapping bounds: self.max = {} was less than other.min = {}",
|
||||
self.max,
|
||||
other.min
|
||||
)
|
||||
} else {
|
||||
Ok(cmp::min(self.max, other.max))
|
||||
}
|
||||
}
|
||||
}
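
A usage sketch (not part of the diff); with only v1.0 defined, both sides collapse to the same version, so this mainly illustrates the shape of the negotiation:

#[cfg(test)]
mod version_negotiation_sketch {
    use super::*;

    #[test]
    fn negotiates_v1_0() {
        let ours = ProtocolRange {
            min: PROTOCOL_MIN_VERSION,
            max: PROTOCOL_MAX_VERSION,
        };
        // Hypothetical range advertised by the agent.
        let theirs = ProtocolRange {
            min: ProtocolVersion::V1_0,
            max: ProtocolVersion::V1_0,
        };
        assert_eq!(
            ours.highest_shared_version(&theirs).unwrap(),
            ProtocolVersion::V1_0
        );
    }
}
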
|
||||
|
||||
/// We send this to the agent after negotiating which protocol to use
|
||||
#[derive(Serialize, Debug)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub enum ProtocolResponse {
|
||||
Error(String),
|
||||
Version(ProtocolVersion),
|
||||
}
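
A sketch (not part of the diff) of what the negotiation response looks like on the wire, assuming `serde_json`: `rename_all = "camelCase"` lowercases the variant names, and `ProtocolVersion` serializes as its inner number.

#[cfg(test)]
mod protocol_response_sketch {
    use super::*;

    #[test]
    fn version_variant_is_camel_cased() {
        let resp = ProtocolResponse::Version(ProtocolVersion::V1_0);
        assert_eq!(serde_json::to_string(&resp).unwrap(), r#"{"version":1}"#);
    }
}
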
|
||||
460
libs/vm_monitor/src/runner.rs
Normal file
@@ -0,0 +1,460 @@
|
||||
//! Exposes the `Runner`, which handles messages received from the agent and
|
||||
//! sends upscale requests.
|
||||
//!
|
||||
//! This is the "Monitor" part of the monitor binary and is the main entrypoint for
|
||||
//! all functionality.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::{fmt::Debug, mem};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use axum::extract::ws::{Message, WebSocket};
|
||||
use futures::StreamExt;
|
||||
use tokio::sync::broadcast;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced};
|
||||
use crate::dispatcher::Dispatcher;
|
||||
use crate::filecache::{FileCacheConfig, FileCacheState};
|
||||
use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
|
||||
use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args, MiB};
|
||||
|
||||
/// Central struct that interacts with agent, dispatcher, and cgroup to handle
|
||||
/// signals from the agent.
|
||||
#[derive(Debug)]
|
||||
pub struct Runner {
|
||||
config: Config,
|
||||
filecache: Option<FileCacheState>,
|
||||
cgroup: Option<Arc<CgroupWatcher>>,
|
||||
dispatcher: Dispatcher,
|
||||
|
||||
/// We "mint" new message ids by incrementing this counter and taking the value.
|
||||
///
|
||||
/// **Note**: This counter is always odd, so that we avoid collisions between the IDs generated
|
||||
/// by us vs the autoscaler-agent.
|
||||
counter: usize,
|
||||
|
||||
/// A signal to kill the main thread produced by `self.run()`. This is triggered
|
||||
/// when the server receives a new connection. When the thread receives the
|
||||
/// signal off this channel, it will gracefully shutdown.
|
||||
kill: broadcast::Receiver<()>,
|
||||
}
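
A tiny sketch (not part of the diff) of the id-parity convention described on `counter`: the monitor mints odd ids by starting at 1 and stepping by 2, so its ids never collide with the agent's even ids.

#[cfg(test)]
mod id_parity_sketch {
    #[test]
    fn minted_ids_stay_odd() {
        let mut counter = 1usize;
        let minted: Vec<usize> = (0..4)
            .map(|_| {
                counter += 2;
                counter
            })
            .collect();
        assert_eq!(minted, vec![3, 5, 7, 9]);
        assert!(minted.iter().all(|id| id % 2 == 1));
    }
}
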
|
||||
|
||||
/// Configuration for a `Runner`
|
||||
#[derive(Debug)]
|
||||
pub struct Config {
|
||||
/// `sys_buffer_bytes` gives the estimated amount of memory, in bytes, that the kernel uses before
|
||||
/// handing out the rest to userspace. This value is the estimated difference between the
|
||||
/// *actual* physical memory and the amount reported by `grep MemTotal /proc/meminfo`.
|
||||
///
|
||||
/// For more information, refer to `man 5 proc`, which defines MemTotal as "Total usable RAM
|
||||
/// (i.e., physical RAM minus a few reserved bits and the kernel binary code)".
|
||||
///
|
||||
/// We only use `sys_buffer_bytes` when calculating the system memory from the *external* memory
|
||||
/// size, rather than the self-reported memory size, according to the kernel.
|
||||
///
|
||||
/// TODO: this field is only necessary while we still have to trust the autoscaler-agent's
|
||||
/// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
|
||||
/// should be removed once we have a better solution there.
|
||||
sys_buffer_bytes: u64,
|
||||
}
|
||||
|
||||
impl Default for Config {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
sys_buffer_bytes: 100 * MiB,
|
||||
}
|
||||
}
|
||||
}
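
A sketch (not part of the diff) of how `sys_buffer_bytes` enters the sizing math below: the monitor always reasons about "usable" memory, i.e. the externally granted amount minus this buffer. The 4 GiB figure is made up; `MiB` is the crate's mebibyte constant imported above.

#[cfg(test)]
mod sys_buffer_sketch {
    use super::*;

    #[test]
    fn usable_memory_subtracts_the_buffer() {
        let config = Config::default(); // sys_buffer_bytes = 100 MiB
        let granted: u64 = 4 * 1024 * MiB; // hypothetical 4 GiB upscale
        let usable = granted.saturating_sub(config.sys_buffer_bytes);
        assert_eq!(usable, granted - 100 * MiB);
    }
}
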
|
||||
|
||||
impl Runner {
|
||||
/// Create a new monitor.
|
||||
#[tracing::instrument(skip_all, fields(?config, ?args))]
|
||||
pub async fn new(
|
||||
config: Config,
|
||||
args: &Args,
|
||||
ws: WebSocket,
|
||||
kill: broadcast::Receiver<()>,
|
||||
token: CancellationToken,
|
||||
) -> anyhow::Result<Runner> {
|
||||
anyhow::ensure!(
|
||||
config.sys_buffer_bytes != 0,
|
||||
"invalid monitor Config: sys_buffer_bytes cannot be 0"
|
||||
);
|
||||
|
||||
// *NOTE*: the dispatcher and cgroup manager talk through these channels
|
||||
// so make sure they each get the correct half, nothing is dropped, etc.
|
||||
let (notified_send, notified_recv) = mpsc::channel(1);
|
||||
let (requesting_send, requesting_recv) = mpsc::channel(1);
|
||||
|
||||
let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
|
||||
.await
|
||||
.context("error creating new dispatcher")?;
|
||||
|
||||
let mut state = Runner {
|
||||
config,
|
||||
filecache: None,
|
||||
cgroup: None,
|
||||
dispatcher,
|
||||
counter: 1, // NB: must be odd, see the comment about the field for more.
|
||||
kill,
|
||||
};
|
||||
|
||||
let mut file_cache_reserved_bytes = 0;
|
||||
let mem = get_total_system_memory();
|
||||
|
||||
// We need to process file cache initialization before cgroup initialization, so that the memory
|
||||
// allocated to the file cache is appropriately taken into account when we decide the cgroup's
|
||||
// memory limits.
|
||||
if let Some(connstr) = &args.pgconnstr {
|
||||
info!("initializing file cache");
|
||||
let config: FileCacheConfig = Default::default();
|
||||
if !config.in_memory {
|
||||
panic!("file cache not in-memory implemented")
|
||||
}
|
||||
|
||||
let mut file_cache = FileCacheState::new(connstr, config, token.clone())
|
||||
.await
|
||||
.context("failed to create file cache")?;
|
||||
|
||||
let size = file_cache
|
||||
.get_file_cache_size()
|
||||
.await
|
||||
.context("error getting file cache size")?;
|
||||
|
||||
let new_size = file_cache.config.calculate_cache_size(mem);
|
||||
info!(
|
||||
initial = bytes_to_mebibytes(size),
|
||||
new = bytes_to_mebibytes(new_size),
|
||||
"setting initial file cache size",
|
||||
);
|
||||
|
||||
// note: even if size == new_size, we want to explicitly set it, just
|
||||
// to make sure that we have the permissions to do so
|
||||
let actual_size = file_cache
|
||||
.set_file_cache_size(new_size)
|
||||
.await
|
||||
.context("failed to set file cache size, possibly due to inadequate permissions")?;
|
||||
if actual_size != new_size {
|
||||
info!("file cache size actually got set to {actual_size}")
|
||||
}
|
||||
file_cache_reserved_bytes = actual_size;
|
||||
|
||||
state.filecache = Some(file_cache);
|
||||
}
|
||||
|
||||
if let Some(name) = &args.cgroup {
|
||||
let (mut cgroup, cgroup_event_stream) =
|
||||
CgroupWatcher::new(name.clone(), requesting_send)
|
||||
.context("failed to create cgroup manager")?;
|
||||
|
||||
let available = mem - file_cache_reserved_bytes;
|
||||
|
||||
cgroup
|
||||
.set_memory_limits(available)
|
||||
.context("failed to set cgroup memory limits")?;
|
||||
|
||||
let cgroup = Arc::new(cgroup);
|
||||
|
||||
// Some might call this . . . cgroup v2
|
||||
let cgroup_clone = Arc::clone(&cgroup);
|
||||
|
||||
spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
|
||||
cgroup_clone.watch(notified_recv, cgroup_event_stream).await
|
||||
});
|
||||
|
||||
state.cgroup = Some(cgroup);
|
||||
} else {
|
||||
// *NOTE*: We need to forget the sender so that its drop impl does not get run.
|
||||
// This allows us to poll it in `Monitor::run` regardless of whether we
|
||||
// are managing a cgroup or not. If we don't forget it, all receives will
|
||||
// immediately return an error because the sender is dropped and it will
|
||||
// claim all select! statements, effectively turning `Monitor::run` into
|
||||
// `loop { fail to receive }`.
|
||||
mem::forget(requesting_send);
|
||||
}
|
||||
|
||||
Ok(state)
|
||||
}
|
||||
|
||||
/// Attempt to downscale filecache + cgroup
|
||||
#[tracing::instrument(skip_all, fields(?target))]
|
||||
pub async fn try_downscale(&mut self, target: Resources) -> anyhow::Result<(bool, String)> {
|
||||
// Nothing to adjust
|
||||
if self.cgroup.is_none() && self.filecache.is_none() {
|
||||
info!("no action needed for downscale (no cgroup or file cache enabled)");
|
||||
return Ok((
|
||||
true,
|
||||
"monitor is not managing cgroup or file cache".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let requested_mem = target.mem;
|
||||
let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
|
||||
let expected_file_cache_mem_usage = self
|
||||
.filecache
|
||||
.as_ref()
|
||||
.map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
|
||||
.unwrap_or(0);
|
||||
let mut new_cgroup_mem_high = 0;
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
new_cgroup_mem_high = cgroup
|
||||
.config
|
||||
.calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);
|
||||
|
||||
let current = cgroup
|
||||
.current_memory_usage()
|
||||
.context("failed to fetch cgroup memory")?;
|
||||
|
||||
if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
|
||||
let status = format!(
|
||||
"{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
|
||||
"calculated memory.high too low",
|
||||
bytes_to_mebibytes(new_cgroup_mem_high),
|
||||
bytes_to_mebibytes(current),
|
||||
bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
|
||||
);
|
||||
|
||||
info!(status, "discontinuing downscale");
|
||||
|
||||
return Ok((false, status));
|
||||
}
|
||||
}
|
||||
|
||||
// The downscaling has been approved. Downscale the file cache, then the cgroup.
|
||||
let mut status = vec![];
|
||||
let mut file_cache_mem_usage = 0;
|
||||
if let Some(file_cache) = &mut self.filecache {
|
||||
if !file_cache.config.in_memory {
|
||||
panic!("file cache not in-memory unimplemented")
|
||||
}
|
||||
|
||||
let actual_usage = file_cache
|
||||
.set_file_cache_size(expected_file_cache_mem_usage)
|
||||
.await
|
||||
.context("failed to set file cache size")?;
|
||||
file_cache_mem_usage = actual_usage;
|
||||
let message = format!(
|
||||
"set file cache size to {} MiB",
|
||||
bytes_to_mebibytes(actual_usage)
|
||||
);
|
||||
info!("downscale: {message}");
|
||||
status.push(message);
|
||||
}
|
||||
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
let available_memory = usable_system_memory - file_cache_mem_usage;
|
||||
|
||||
if file_cache_mem_usage != expected_file_cache_mem_usage {
|
||||
new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
|
||||
}
|
||||
|
||||
let limits = MemoryLimits::new(
|
||||
// new_cgroup_mem_high is initialized to 0, but it is guaranteed to not be 0 here
|
||||
// since it is properly initialized in the previous cgroup if let block
|
||||
new_cgroup_mem_high,
|
||||
available_memory,
|
||||
);
|
||||
cgroup
|
||||
.set_limits(&limits)
|
||||
.context("failed to set cgroup memory limits")?;
|
||||
|
||||
let message = format!(
|
||||
"set cgroup memory.high to {} MiB, of new max {} MiB",
|
||||
bytes_to_mebibytes(new_cgroup_mem_high),
|
||||
bytes_to_mebibytes(available_memory)
|
||||
);
|
||||
info!("downscale: {message}");
|
||||
status.push(message);
|
||||
}
|
||||
|
||||
// TODO: make this status thing less jank
|
||||
let status = status.join("; ");
|
||||
Ok((true, status))
|
||||
}
|
||||
|
||||
/// Handle new resources
|
||||
#[tracing::instrument(skip_all, fields(?resources))]
|
||||
pub async fn handle_upscale(&mut self, resources: Resources) -> anyhow::Result<()> {
|
||||
if self.filecache.is_none() && self.cgroup.is_none() {
|
||||
info!("no action needed for upscale (no cgroup or file cache enabled)");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let new_mem = resources.mem;
|
||||
let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);
|
||||
|
||||
// Get the file cache's expected contribution to the memory usage
|
||||
let mut file_cache_mem_usage = 0;
|
||||
if let Some(file_cache) = &mut self.filecache {
|
||||
if !file_cache.config.in_memory {
|
||||
panic!("file cache not in-memory unimplemented");
|
||||
}
|
||||
|
||||
let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
|
||||
info!(
|
||||
target = bytes_to_mebibytes(expected_usage),
|
||||
total = bytes_to_mebibytes(new_mem),
|
||||
"updating file cache size",
|
||||
);
|
||||
|
||||
let actual_usage = file_cache
|
||||
.set_file_cache_size(expected_usage)
|
||||
.await
|
||||
.context("failed to set file cache size")?;
|
||||
|
||||
if actual_usage != expected_usage {
|
||||
warn!(
"file cache was set to a different size than we wanted: target = {} MiB, actual = {} MiB",
|
||||
bytes_to_mebibytes(expected_usage),
|
||||
bytes_to_mebibytes(actual_usage)
|
||||
)
|
||||
}
|
||||
file_cache_mem_usage = actual_usage;
|
||||
}
|
||||
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
let available_memory = usable_system_memory - file_cache_mem_usage;
|
||||
let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
|
||||
info!(
|
||||
target = bytes_to_mebibytes(new_cgroup_mem_high),
|
||||
total = bytes_to_mebibytes(new_mem),
|
||||
name = cgroup.path(),
|
||||
"updating cgroup memory.high",
|
||||
);
|
||||
let limits = MemoryLimits::new(new_cgroup_mem_high, available_memory);
|
||||
cgroup
|
||||
.set_limits(&limits)
|
||||
.context("failed to set file cache size")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Take in a message and perform some action, such as downscaling or upscaling,
|
||||
/// and return a message to be send back.
|
||||
#[tracing::instrument(skip_all, fields(%id, message = ?inner))]
|
||||
pub async fn process_message(
|
||||
&mut self,
|
||||
InboundMsg { inner, id }: InboundMsg,
|
||||
) -> anyhow::Result<Option<OutboundMsg>> {
|
||||
match inner {
|
||||
InboundMsgKind::UpscaleNotification { granted } => {
|
||||
self.handle_upscale(granted)
|
||||
.await
|
||||
.context("failed to handle upscale")?;
|
||||
self.dispatcher
|
||||
.notify_upscale(Sequenced::new(granted))
|
||||
.await
|
||||
.context("failed to notify cgroup of upscale")?;
|
||||
Ok(Some(OutboundMsg::new(
|
||||
OutboundMsgKind::UpscaleConfirmation {},
|
||||
id,
|
||||
)))
|
||||
}
|
||||
InboundMsgKind::DownscaleRequest { target } => self
|
||||
.try_downscale(target)
|
||||
.await
|
||||
.context("failed to downscale")
|
||||
.map(|(ok, status)| {
|
||||
Some(OutboundMsg::new(
|
||||
OutboundMsgKind::DownscaleResult { ok, status },
|
||||
id,
|
||||
))
|
||||
}),
|
||||
InboundMsgKind::InvalidMessage { error } => {
|
||||
warn!(
|
||||
%error, id, "received notification of an invalid message we sent"
|
||||
);
|
||||
Ok(None)
|
||||
}
|
||||
InboundMsgKind::InternalError { error } => {
|
||||
warn!(error, id, "agent experienced an internal error");
|
||||
Ok(None)
|
||||
}
|
||||
InboundMsgKind::HealthCheck {} => {
|
||||
Ok(Some(OutboundMsg::new(OutboundMsgKind::HealthCheck {}, id)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: don't propagate errors, probably just warn!?
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn run(&mut self) -> anyhow::Result<()> {
|
||||
info!("starting dispatcher");
|
||||
loop {
|
||||
tokio::select! {
|
||||
signal = self.kill.recv() => {
|
||||
match signal {
|
||||
Ok(()) => return Ok(()),
|
||||
Err(e) => bail!("failed to receive kill signal: {e}")
|
||||
}
|
||||
}
|
||||
// we need to propagate an upscale request
|
||||
request = self.dispatcher.request_upscale_events.recv() => {
|
||||
if request.is_none() {
|
||||
bail!("failed to listen for upscale event from cgroup")
|
||||
}
|
||||
info!("cgroup asking for upscale; forwarding request");
|
||||
self.counter += 2; // Increment, preserving parity (i.e. keep the
|
||||
// counter odd). See the field comment for more.
|
||||
self.dispatcher
|
||||
.send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter))
|
||||
.await
|
||||
.context("failed to send message")?;
|
||||
}
|
||||
// there is a message from the agent
|
||||
msg = self.dispatcher.source.next() => {
|
||||
if let Some(msg) = msg {
|
||||
// Don't use 'message' as a key as the string also uses
|
||||
// that for its key
|
||||
info!(?msg, "received message");
|
||||
match msg {
|
||||
Ok(msg) => {
|
||||
let message: InboundMsg = match msg {
|
||||
Message::Text(text) => {
|
||||
serde_json::from_str(&text).context("failed to deserialize text message")?
|
||||
}
|
||||
other => {
|
||||
warn!(
|
||||
// Don't use 'message' as a key as the
|
||||
// string also uses that for its key
|
||||
msg = ?other,
|
||||
"agent should only send text messages but received different type"
|
||||
);
|
||||
continue
|
||||
},
|
||||
};
|
||||
|
||||
let out = match self.process_message(message.clone()).await {
|
||||
Ok(Some(out)) => out,
|
||||
Ok(None) => continue,
|
||||
Err(e) => {
|
||||
let error = e.to_string();
|
||||
warn!(?error, "error handling message");
|
||||
OutboundMsg::new(
|
||||
OutboundMsgKind::InternalError {
|
||||
error
|
||||
},
|
||||
message.id
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
self.dispatcher
|
||||
.send(out)
|
||||
.await
|
||||
.context("failed to send message")?;
|
||||
}
|
||||
Err(e) => warn!("{e}"),
|
||||
}
|
||||
} else {
|
||||
anyhow::bail!("dispatcher connection closed")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
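
The `memory.high` guard in `try_downscale` above, reduced to a self-contained sketch (not part of the diff): a downscale is refused whenever the proposed `memory.high` would fall below current usage plus the configured buffer. All byte values here are hypothetical.

#[cfg(test)]
mod downscale_guard_sketch {
    #[test]
    fn refuses_a_too_low_memory_high() {
        let current_usage: u64 = 900 << 20; // 900 MiB currently in use
        let buffer: u64 = 100 << 20; // memory_high_buffer_bytes
        let proposed_high: u64 = 950 << 20; // candidate memory.high after downscale
        // Mirrors: new_cgroup_mem_high < current + memory_high_buffer_bytes
        assert!(proposed_high < current_usage + buffer);
    }
}
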
|
||||
@@ -51,6 +51,7 @@ serde.workspace = true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
serde_with.workspace = true
|
||||
signal-hook.workspace = true
|
||||
smallvec = { workspace = true, features = ["write"] }
|
||||
svg_fmt.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
|
||||
@@ -215,7 +215,6 @@ fn bench_sequential(c: &mut Criterion) {
|
||||
TimelineId::generate(),
|
||||
zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
||||
Lsn(i),
|
||||
false,
|
||||
0,
|
||||
);
|
||||
updates.insert_historic(layer);
|
||||
|
||||
@@ -10,7 +10,7 @@ use std::{fs, path::Path, str};
|
||||
|
||||
use pageserver::page_cache::PAGE_SZ;
|
||||
use pageserver::repository::{Key, KEY_SIZE};
|
||||
use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
|
||||
use pageserver::tenant::block_io::FileBlockReader;
|
||||
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
|
||||
use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
|
||||
use pageserver::tenant::storage_layer::range_overlaps;
|
||||
|
||||
@@ -44,8 +44,6 @@ pub(crate) enum LayerCmd {
|
||||
}
|
||||
|
||||
async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
use pageserver::tenant::block_io::BlockReader;
|
||||
|
||||
let path = path.as_ref();
|
||||
virtual_file::init(10);
|
||||
page_cache::init(100);
|
||||
@@ -70,7 +68,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let cursor = BlockCursor::new(&file);
|
||||
let cursor = BlockCursor::new_fileblockreader_virtual(&file);
|
||||
for (k, v) in all {
|
||||
let value = cursor.read_blob(v.pos()).await?;
|
||||
println!("key:{} value_len:{}", k, value.len());
|
||||
|
||||
@@ -11,6 +11,7 @@ use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use pageserver::tenant::TenantSharedResources;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::time::Instant;
|
||||
use tracing::*;
|
||||
@@ -382,8 +383,10 @@ fn start_pageserver(
|
||||
|
||||
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
|
||||
conf,
|
||||
broker_client.clone(),
|
||||
remote_storage.clone(),
|
||||
TenantSharedResources {
|
||||
broker_client: broker_client.clone(),
|
||||
remote_storage: remote_storage.clone(),
|
||||
},
|
||||
order,
|
||||
))?;
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ use metrics::{
|
||||
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use strum::VariantNames;
|
||||
use strum::{EnumCount, IntoEnumIterator, VariantNames};
|
||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
@@ -570,23 +570,160 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
|
||||
"get_rel_exists",
|
||||
"get_rel_size",
|
||||
"get_page_at_lsn",
|
||||
"get_db_size",
|
||||
];
|
||||
#[derive(Debug)]
|
||||
struct GlobalAndPerTimelineHistogram {
|
||||
global: Histogram,
|
||||
per_tenant_timeline: Histogram,
|
||||
}
|
||||
|
||||
pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
impl GlobalAndPerTimelineHistogram {
|
||||
fn observe(&self, value: f64) {
|
||||
self.global.observe(value);
|
||||
self.per_tenant_timeline.observe(value);
|
||||
}
|
||||
}
|
||||
|
||||
struct GlobalAndPerTimelineHistogramTimer<'a> {
|
||||
h: &'a GlobalAndPerTimelineHistogram,
|
||||
start: std::time::Instant,
|
||||
}
|
||||
|
||||
impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> {
|
||||
fn drop(&mut self) {
|
||||
let elapsed = self.start.elapsed();
|
||||
self.h.observe(elapsed.as_secs_f64());
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Debug,
|
||||
Clone,
|
||||
Copy,
|
||||
IntoStaticStr,
|
||||
strum_macros::EnumCount,
|
||||
strum_macros::EnumIter,
|
||||
strum_macros::FromRepr,
|
||||
)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub enum SmgrQueryType {
|
||||
GetRelExists,
|
||||
GetRelSize,
|
||||
GetPageAtLsn,
|
||||
GetDbSize,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct SmgrQueryTimePerTimeline {
|
||||
metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
|
||||
}
|
||||
|
||||
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
"pageserver_smgr_query_seconds",
"Time spent on smgr query handling",
"Time spent on smgr query handling, aggregated by query type and tenant/timeline.",
|
||||
&["smgr_query_type", "tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds_global",
|
||||
"Time spent on smgr query handling, aggregated by query type.",
|
||||
&["smgr_query_type"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
impl SmgrQueryTimePerTimeline {
|
||||
pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let metrics = std::array::from_fn(|i| {
|
||||
let op = SmgrQueryType::from_repr(i).unwrap();
|
||||
let global = SMGR_QUERY_TIME_GLOBAL
|
||||
.get_metric_with_label_values(&[op.into()])
|
||||
.unwrap();
|
||||
let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
|
||||
.get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
GlobalAndPerTimelineHistogram {
|
||||
global,
|
||||
per_tenant_timeline,
|
||||
}
|
||||
});
|
||||
Self { metrics }
|
||||
}
|
||||
pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ {
|
||||
let metric = &self.metrics[op as usize];
|
||||
GlobalAndPerTimelineHistogramTimer {
|
||||
h: metric,
|
||||
start: std::time::Instant::now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod smgr_query_time_tests {
|
||||
use strum::IntoEnumIterator;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
// Regression test, we used hard-coded string constants before using an enum.
|
||||
#[test]
|
||||
fn op_label_name() {
|
||||
use super::SmgrQueryType::*;
|
||||
let expect: [(super::SmgrQueryType, &'static str); 4] = [
|
||||
(GetRelExists, "get_rel_exists"),
|
||||
(GetRelSize, "get_rel_size"),
|
||||
(GetPageAtLsn, "get_page_at_lsn"),
|
||||
(GetDbSize, "get_db_size"),
|
||||
];
|
||||
for (op, expect) in expect {
|
||||
let actual: &'static str = op.into();
|
||||
assert_eq!(actual, expect);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn basic() {
|
||||
let ops: Vec<_> = super::SmgrQueryType::iter().collect();
|
||||
|
||||
for op in &ops {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
|
||||
|
||||
let get_counts = || {
|
||||
let global: u64 = ops
|
||||
.iter()
|
||||
.map(|op| metrics.metrics[*op as usize].global.get_sample_count())
|
||||
.sum();
|
||||
let per_tenant_timeline: u64 = ops
|
||||
.iter()
|
||||
.map(|op| {
|
||||
metrics.metrics[*op as usize]
|
||||
.per_tenant_timeline
|
||||
.get_sample_count()
|
||||
})
|
||||
.sum();
|
||||
(global, per_tenant_timeline)
|
||||
};
|
||||
|
||||
let (pre_global, pre_per_tenant_timeline) = get_counts();
|
||||
assert_eq!(pre_per_tenant_timeline, 0);
|
||||
|
||||
let timer = metrics.start_timer(*op);
|
||||
drop(timer);
|
||||
|
||||
let (post_global, post_per_tenant_timeline) = get_counts();
|
||||
assert_eq!(post_per_tenant_timeline, 1);
|
||||
assert!(post_global > pre_global);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// keep in sync with control plane Go code so that we can validate
|
||||
// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
|
||||
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||
@@ -1045,6 +1182,12 @@ impl Drop for TimelineMetrics {
|
||||
.write()
|
||||
.unwrap()
|
||||
.remove(tenant_id, timeline_id);
|
||||
|
||||
// The following metrics are born outside of the TimelineMetrics lifecycle but still
|
||||
// removed at the end of it. The idea is to have the metrics outlive the
|
||||
// entity during which they're observed, e.g., the smgr metrics shall
|
||||
// outlive an individual smgr connection, but not the timeline.
|
||||
|
||||
for op in StorageTimeOperation::VARIANTS {
|
||||
let _ =
|
||||
STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
@@ -1056,8 +1199,12 @@ impl Drop for TimelineMetrics {
|
||||
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
|
||||
for op in SMGR_QUERY_TIME_OPERATIONS {
|
||||
let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
for op in SmgrQueryType::iter() {
|
||||
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
|
||||
op.into(),
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -203,6 +203,11 @@ impl Slot {
|
||||
Err(usage_count) => usage_count,
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the usage count to a specific value.
|
||||
fn set_usage_count(&self, count: u8) {
|
||||
self.usage_count.store(count, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PageCache {
|
||||
@@ -263,6 +268,7 @@ pub struct PageWriteGuard<'i> {
|
||||
inner: RwLockWriteGuard<'i, SlotInner>,
|
||||
|
||||
// Are the page contents currently valid?
|
||||
// Used to mark pages as invalid that are assigned but not yet filled with data.
|
||||
valid: bool,
|
||||
}
|
||||
|
||||
@@ -425,27 +431,6 @@ impl PageCache {
|
||||
self.lock_for_read(&mut cache_key)
|
||||
}
|
||||
|
||||
/// Immediately drop all buffers belonging to given file
|
||||
pub fn drop_buffers_for_immutable(&self, drop_file_id: FileId) {
|
||||
for slot_idx in 0..self.slots.len() {
|
||||
let slot = &self.slots[slot_idx];
|
||||
|
||||
let mut inner = slot.inner.write().unwrap();
|
||||
if let Some(key) = &inner.key {
|
||||
match key {
|
||||
CacheKey::ImmutableFilePage { file_id, blkno: _ }
|
||||
if *file_id == drop_file_id =>
|
||||
{
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(key);
|
||||
inner.key = None;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Section 2: Internal interface functions for lookup/update.
|
||||
//
|
||||
@@ -556,7 +541,7 @@ impl PageCache {
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
slot.set_usage_count(1);
|
||||
|
||||
return Ok(ReadBufResult::NotFound(PageWriteGuard {
|
||||
inner,
|
||||
@@ -617,7 +602,7 @@ impl PageCache {
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
slot.set_usage_count(1);
|
||||
|
||||
return Ok(WriteBufResult::NotFound(PageWriteGuard {
|
||||
inner,
|
||||
@@ -816,6 +801,8 @@ impl PageCache {
|
||||
fn new(num_pages: usize) -> Self {
|
||||
assert!(num_pages > 0, "page cache size must be > 0");
|
||||
|
||||
// We use Box::leak here and into_boxed_slice to avoid leaking uninitialized
|
||||
// memory that Vec's might contain.
|
||||
let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
|
||||
|
||||
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
|
||||
|
||||
@@ -50,7 +50,8 @@ use crate::basebackup;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
|
||||
use crate::metrics;
|
||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant;
|
||||
@@ -306,39 +307,6 @@ async fn page_service_conn_main(
|
||||
}
|
||||
}
|
||||
|
||||
struct PageRequestMetrics {
|
||||
get_rel_exists: metrics::Histogram,
|
||||
get_rel_size: metrics::Histogram,
|
||||
get_page_at_lsn: metrics::Histogram,
|
||||
get_db_size: metrics::Histogram,
|
||||
}
|
||||
|
||||
impl PageRequestMetrics {
|
||||
fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
|
||||
let get_rel_exists =
|
||||
SMGR_QUERY_TIME.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]);
|
||||
|
||||
let get_rel_size =
|
||||
SMGR_QUERY_TIME.with_label_values(&["get_rel_size", &tenant_id, &timeline_id]);
|
||||
|
||||
let get_page_at_lsn =
|
||||
SMGR_QUERY_TIME.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]);
|
||||
|
||||
let get_db_size =
|
||||
SMGR_QUERY_TIME.with_label_values(&["get_db_size", &tenant_id, &timeline_id]);
|
||||
|
||||
Self {
|
||||
get_rel_exists,
|
||||
get_rel_size,
|
||||
get_page_at_lsn,
|
||||
get_db_size,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct PageServerHandler {
|
||||
_conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
@@ -406,7 +374,7 @@ impl PageServerHandler {
|
||||
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id);
|
||||
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
|
||||
|
||||
loop {
|
||||
let msg = tokio::select! {
|
||||
@@ -446,21 +414,21 @@ impl PageServerHandler {
|
||||
|
||||
let response = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
let _timer = metrics.get_rel_exists.start_timer();
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
|
||||
self.handle_get_rel_exists_request(&timeline, &req, &ctx)
|
||||
.await
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
let _timer = metrics.get_rel_size.start_timer();
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
|
||||
self.handle_get_nblocks_request(&timeline, &req, &ctx).await
|
||||
}
|
||||
PagestreamFeMessage::GetPage(req) => {
|
||||
let _timer = metrics.get_page_at_lsn.start_timer();
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
|
||||
self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
|
||||
.await
|
||||
}
|
||||
PagestreamFeMessage::DbSize(req) => {
|
||||
let _timer = metrics.get_db_size.start_timer();
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
|
||||
self.handle_db_size_request(&timeline, &req, &ctx).await
|
||||
}
|
||||
};
|
||||
@@ -984,8 +952,8 @@ where
|
||||
false
|
||||
};
|
||||
|
||||
metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||
&*crate::metrics::BASEBACKUP_QUERY_TIME,
|
||||
::metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||
&*metrics::BASEBACKUP_QUERY_TIME,
|
||||
async move {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
|
||||
@@ -56,6 +56,7 @@ use self::remote_timeline_client::RemoteTimelineClient;
|
||||
use self::timeline::uninit::TimelineUninitMark;
|
||||
use self::timeline::uninit::UninitializedTimeline;
|
||||
use self::timeline::EvictionTaskTenantState;
|
||||
use self::timeline::TimelineResources;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir;
|
||||
@@ -71,7 +72,6 @@ use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
|
||||
use crate::tenant::storage_layer::DeltaLayer;
|
||||
use crate::tenant::storage_layer::ImageLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::InitializationOrder;
|
||||
|
||||
use crate::tenant::timeline::delete::DeleteTimelineFlow;
|
||||
@@ -150,6 +150,14 @@ pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching";
|
||||
|
||||
pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
|
||||
|
||||
/// References to shared objects that are passed into each tenant, such
|
||||
/// as the shared remote storage client and process initialization state.
|
||||
#[derive(Clone)]
|
||||
pub struct TenantSharedResources {
|
||||
pub broker_client: storage_broker::BrokerClientChannel,
|
||||
pub remote_storage: Option<GenericRemoteStorage>,
|
||||
}
|
||||
|
||||
///
|
||||
/// Tenant consists of multiple timelines. Keep them in a hash table.
|
||||
///
|
||||
@@ -389,7 +397,7 @@ impl Tenant {
|
||||
async fn timeline_init_and_sync(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
resources: TimelineResources,
|
||||
remote_startup_data: Option<RemoteStartupData>,
|
||||
local_metadata: Option<TimelineMetadata>,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
@@ -410,17 +418,57 @@ impl Tenant {
|
||||
timeline_id,
|
||||
up_to_date_metadata,
|
||||
ancestor.clone(),
|
||||
remote_client,
|
||||
resources,
|
||||
init_order,
|
||||
CreateTimelineCause::Load,
|
||||
)?;
|
||||
let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn();
|
||||
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
|
||||
anyhow::ensure!(
|
||||
new_disk_consistent_lsn.is_valid(),
|
||||
disk_consistent_lsn.is_valid(),
|
||||
"Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
|
||||
);
|
||||
assert_eq!(
|
||||
disk_consistent_lsn,
|
||||
up_to_date_metadata.disk_consistent_lsn(),
|
||||
"these are used interchangeably"
|
||||
);
|
||||
|
||||
// Save the metadata file to local disk.
|
||||
if !picked_local {
|
||||
save_metadata(
|
||||
self.conf,
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
up_to_date_metadata,
|
||||
first_save,
|
||||
)
|
||||
.context("save_metadata")?;
|
||||
}
|
||||
|
||||
let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
|
||||
|
||||
if let Some(index_part) = index_part {
|
||||
timeline
|
||||
.remote_client
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.init_upload_queue(index_part)?;
|
||||
} else if self.remote_storage.is_some() {
|
||||
// No data on the remote storage, but we have local metadata file. We can end up
|
||||
// here with timeline_create being interrupted before finishing index part upload.
|
||||
// By doing what we do here, the index part upload is retried.
|
||||
// If control plane retries timeline creation in the meantime, the mgmt API handler
|
||||
// for timeline creation will coalesce on the upload we queue here.
|
||||
let rtc = timeline.remote_client.as_ref().unwrap();
|
||||
rtc.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
|
||||
rtc.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
|
||||
}
|
||||
|
||||
timeline
|
||||
.load_layer_map(new_disk_consistent_lsn)
|
||||
.load_layer_map(
|
||||
disk_consistent_lsn,
|
||||
remote_startup_data.map(|x| x.index_part),
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
|
||||
@@ -444,19 +492,6 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
if self.remote_storage.is_some() {
|
||||
// Reconcile local state with remote storage, downloading anything that's
|
||||
// missing locally, and scheduling uploads for anything that's missing
|
||||
// in remote storage.
|
||||
timeline
|
||||
.reconcile_with_remote(
|
||||
up_to_date_metadata,
|
||||
remote_startup_data.as_ref().map(|r| &r.index_part),
|
||||
)
|
||||
.await
|
||||
.context("failed to reconcile with remote")?
|
||||
}
|
||||
|
||||
// Sanity check: a timeline should have some content.
|
||||
anyhow::ensure!(
|
||||
ancestor.is_some()
|
||||
@@ -471,18 +506,6 @@ impl Tenant {
|
||||
"Timeline has no ancestor and no layer files"
|
||||
);
|
||||
|
||||
// Save the metadata file to local disk.
|
||||
if !picked_local {
|
||||
save_metadata(
|
||||
self.conf,
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
up_to_date_metadata,
|
||||
first_save,
|
||||
)
|
||||
.context("save_metadata")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -675,10 +698,7 @@ impl Tenant {
|
||||
debug!("successfully downloaded index part for timeline {timeline_id}");
|
||||
match index_part {
|
||||
MaybeDeletedIndexPart::IndexPart(index_part) => {
|
||||
timeline_ancestors.insert(
|
||||
timeline_id,
|
||||
index_part.parse_metadata().context("parse_metadata")?,
|
||||
);
|
||||
timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
|
||||
remote_index_and_client.insert(timeline_id, (index_part, client));
|
||||
}
|
||||
MaybeDeletedIndexPart::Deleted(index_part) => {
|
||||
@@ -701,14 +721,22 @@ impl Tenant {
|
||||
.expect("just put it in above");
|
||||
|
||||
// TODO again handle early failure
|
||||
self.load_remote_timeline(timeline_id, index_part, remote_metadata, remote_client, ctx)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"failed to load remote timeline {} for tenant {}",
|
||||
timeline_id, self.tenant_id
|
||||
)
|
||||
})?;
|
||||
self.load_remote_timeline(
|
||||
timeline_id,
|
||||
index_part,
|
||||
remote_metadata,
|
||||
TimelineResources {
|
||||
remote_client: Some(remote_client),
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"failed to load remote timeline {} for tenant {}",
|
||||
timeline_id, self.tenant_id
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
// Walk through deleted timelines, resume deletion
|
||||
@@ -721,7 +749,7 @@ impl Tenant {
|
||||
DeleteTimelineFlow::resume_deletion(
|
||||
Arc::clone(self),
|
||||
timeline_id,
|
||||
&index_part.parse_metadata().context("parse_metadata")?,
|
||||
&index_part.metadata,
|
||||
Some(remote_timeline_client),
|
||||
None,
|
||||
)
|
||||
@@ -763,7 +791,7 @@ impl Tenant {
|
||||
timeline_id: TimelineId,
|
||||
index_part: IndexPart,
|
||||
remote_metadata: TimelineMetadata,
|
||||
remote_client: RemoteTimelineClient,
|
||||
resources: TimelineResources,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
@@ -793,7 +821,7 @@ impl Tenant {
|
||||
|
||||
self.timeline_init_and_sync(
|
||||
timeline_id,
|
||||
Some(remote_client),
|
||||
resources,
|
||||
Some(RemoteStartupData {
|
||||
index_part,
|
||||
remote_metadata,
|
||||
@@ -840,8 +868,7 @@ impl Tenant {
|
||||
pub(crate) fn spawn_load(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
resources: TenantSharedResources,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
@@ -856,6 +883,9 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
let broker_client = resources.broker_client;
|
||||
let remote_storage = resources.remote_storage;
|
||||
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
|
||||
let tenant = Tenant::new(
|
||||
TenantState::Loading,
|
||||
@@ -1241,16 +1271,9 @@ impl Tenant {
|
||||
) -> Result<(), LoadLocalTimelineError> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
|
||||
RemoteTimelineClient::new(
|
||||
remote_storage.clone(),
|
||||
self.conf,
|
||||
self.tenant_id,
|
||||
timeline_id,
|
||||
)
|
||||
});
|
||||
let mut resources = self.build_timeline_resources(timeline_id);
|
||||
|
||||
let (remote_startup_data, remote_client) = match remote_client {
|
||||
let (remote_startup_data, remote_client) = match resources.remote_client {
|
||||
Some(remote_client) => match remote_client.download_index_file().await {
|
||||
Ok(index_part) => {
|
||||
let index_part = match index_part {
|
||||
@@ -1288,10 +1311,7 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
let remote_metadata = index_part
|
||||
.parse_metadata()
|
||||
.context("parse_metadata")
|
||||
.map_err(LoadLocalTimelineError::Load)?;
|
||||
let remote_metadata = index_part.metadata.clone();
|
||||
(
|
||||
Some(RemoteStartupData {
|
||||
index_part,
|
||||
@@ -1338,9 +1358,10 @@ impl Tenant {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
(None, remote_client)
|
||||
(None, resources.remote_client)
|
||||
}
|
||||
};
|
||||
resources.remote_client = remote_client;
|
||||
|
||||
let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
|
||||
let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
|
||||
@@ -1353,7 +1374,7 @@ impl Tenant {
|
||||
|
||||
self.timeline_init_and_sync(
|
||||
timeline_id,
|
||||
remote_client,
|
||||
resources,
|
||||
remote_startup_data,
|
||||
Some(local_metadata),
|
||||
ancestor,
|
||||
@@ -2225,7 +2246,7 @@ impl Tenant {
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
resources: TimelineResources,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
cause: CreateTimelineCause,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
@@ -2254,7 +2275,7 @@ impl Tenant {
|
||||
new_timeline_id,
|
||||
self.tenant_id,
|
||||
Arc::clone(&self.walredo_mgr),
|
||||
remote_client,
|
||||
resources,
|
||||
pg_version,
|
||||
initial_logical_size_can_start.cloned(),
|
||||
initial_logical_size_attempt.cloned().flatten(),
|
||||
@@ -2902,6 +2923,23 @@ impl Tenant {
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
/// Call this before constructing a timeline, to build its required structures
|
||||
fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
|
||||
let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
|
||||
let remote_client = RemoteTimelineClient::new(
|
||||
remote_storage.clone(),
|
||||
self.conf,
|
||||
self.tenant_id,
|
||||
timeline_id,
|
||||
);
|
||||
Some(remote_client)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
TimelineResources { remote_client }
|
||||
}
|
||||
|
||||
/// Creates intermediate timeline structure and its files.
|
||||
///
|
||||
/// An empty layer map is initialized, and new data and WAL can be imported starting
|
||||
@@ -2918,25 +2956,17 @@ impl Tenant {
|
||||
) -> anyhow::Result<UninitializedTimeline> {
|
||||
let tenant_id = self.tenant_id;
|
||||
|
||||
let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
|
||||
let remote_client = RemoteTimelineClient::new(
|
||||
remote_storage.clone(),
|
||||
self.conf,
|
||||
tenant_id,
|
||||
new_timeline_id,
|
||||
);
|
||||
let resources = self.build_timeline_resources(new_timeline_id);
|
||||
if let Some(remote_client) = &resources.remote_client {
|
||||
remote_client.init_upload_queue_for_empty_remote(new_metadata)?;
|
||||
Some(remote_client)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
}
|
||||
|
||||
let timeline_struct = self
|
||||
.create_timeline_struct(
|
||||
new_timeline_id,
|
||||
new_metadata,
|
||||
ancestor,
|
||||
remote_client,
|
||||
resources,
|
||||
None,
|
||||
CreateTimelineCause::Load,
|
||||
)
|
||||
@@ -4071,7 +4101,7 @@ mod tests {
|
||||
let mut found_error_message = false;
|
||||
let mut err_source = err.source();
|
||||
while let Some(source) = err_source {
|
||||
if source.to_string() == "metadata checksum mismatch" {
|
||||
if source.to_string().contains("metadata checksum mismatch") {
|
||||
found_error_message = true;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -12,14 +12,11 @@
|
||||
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
|
||||
//!
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::tenant::block_io::{BlockCursor, BlockReader};
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
|
||||
impl<R> BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
impl<'a> BlockCursor<'a> {
|
||||
/// Read a blob into a new buffer.
|
||||
pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
|
||||
let mut buf = Vec::new();
|
||||
|
||||
@@ -2,8 +2,12 @@
|
||||
//! Low-level Block-oriented I/O functions
|
||||
//!
|
||||
|
||||
use super::ephemeral_file::EphemeralFile;
|
||||
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
|
||||
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use bytes::Bytes;
|
||||
use std::fs::File;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
@@ -13,32 +17,20 @@ use std::os::unix::fs::FileExt;
|
||||
/// There are currently two implementations: EphemeralFile, and FileBlockReader
|
||||
/// below.
|
||||
pub trait BlockReader {
|
||||
///
|
||||
/// Read a block. Returns a "lease" object that can be used to
|
||||
/// access the contents of the page. (For the page cache, the
|
||||
/// lease object represents a lock on the buffer.)
|
||||
///
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error>;
|
||||
|
||||
///
|
||||
/// Create a new "cursor" for reading from this reader.
|
||||
///
|
||||
/// A cursor caches the last accessed page, allowing for faster
|
||||
/// access if the same block is accessed repeatedly.
|
||||
fn block_cursor(&self) -> BlockCursor<&Self>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
BlockCursor::new(self)
|
||||
}
|
||||
fn block_cursor(&self) -> BlockCursor<'_>;
|
||||
}
|
||||
|
||||
impl<B> BlockReader for &B
|
||||
where
|
||||
B: BlockReader,
|
||||
{
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
(*self).read_blk(blknum)
|
||||
fn block_cursor(&self) -> BlockCursor<'_> {
|
||||
(*self).block_cursor()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,6 +68,34 @@ impl<'a> Deref for BlockLease<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Provides the ability to read blocks from different sources,
|
||||
/// similar to using traits for this purpose.
|
||||
///
|
||||
/// Unlike a trait, this also allows the read function to be async.
|
||||
pub(crate) enum BlockReaderRef<'a> {
|
||||
FileBlockReaderVirtual(&'a FileBlockReader<VirtualFile>),
|
||||
FileBlockReaderFile(&'a FileBlockReader<std::fs::File>),
|
||||
EphemeralFile(&'a EphemeralFile),
|
||||
Adapter(Adapter<&'a DeltaLayerInner>),
|
||||
#[cfg(test)]
|
||||
TestDisk(&'a super::disk_btree::tests::TestDisk),
|
||||
}
|
||||
|
||||
impl<'a> BlockReaderRef<'a> {
|
||||
#[inline(always)]
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
use BlockReaderRef::*;
|
||||
match self {
|
||||
FileBlockReaderVirtual(r) => r.read_blk(blknum),
|
||||
FileBlockReaderFile(r) => r.read_blk(blknum),
|
||||
EphemeralFile(r) => r.read_blk(blknum),
|
||||
Adapter(r) => r.read_blk(blknum),
|
||||
#[cfg(test)]
|
||||
TestDisk(r) => r.read_blk(blknum),
|
||||
}
|
||||
}
|
||||
}
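
A miniature of the enum-dispatch pattern above (not part of the diff): instead of a trait object, the cursor holds an enum of concrete reader references and matches on it, which keeps dispatch static and leaves room for async read methods later.

#[cfg(test)]
mod enum_dispatch_sketch {
    // Hypothetical, self-contained stand-in for BlockReaderRef.
    enum ReaderRefSketch<'a> {
        InMemory(&'a [u8]),
        Constant(u8),
    }

    impl<'a> ReaderRefSketch<'a> {
        fn read_byte(&self, i: usize) -> u8 {
            match self {
                ReaderRefSketch::InMemory(buf) => buf[i],
                ReaderRefSketch::Constant(b) => *b,
            }
        }
    }

    #[test]
    fn dispatches_to_the_concrete_reader() {
        let data = [1u8, 2, 3];
        assert_eq!(ReaderRefSketch::InMemory(&data).read_byte(2), 3);
        assert_eq!(ReaderRefSketch::Constant(7).read_byte(0), 7);
    }
}
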
|
||||
|
||||
///
|
||||
/// A "cursor" for efficiently reading multiple pages from a BlockReader
|
||||
///
|
||||
@@ -93,21 +113,27 @@ impl<'a> Deref for BlockLease<'a> {
|
||||
/// // do stuff with 'buf'
|
||||
/// ```
|
||||
///
|
||||
pub struct BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
reader: R,
|
||||
pub struct BlockCursor<'a> {
|
||||
reader: BlockReaderRef<'a>,
|
||||
}
|
||||
|
||||
impl<R> BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
pub fn new(reader: R) -> Self {
|
||||
impl<'a> BlockCursor<'a> {
|
||||
pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self {
|
||||
BlockCursor { reader }
|
||||
}
|
||||
// Needed by cli
|
||||
pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader<VirtualFile>) -> Self {
|
||||
BlockCursor {
|
||||
reader: BlockReaderRef::FileBlockReaderVirtual(reader),
|
||||
}
|
||||
}
|
||||
|
||||
/// Read a block.
|
||||
///
|
||||
/// Returns a "lease" object that can be used to
|
||||
/// access the contents of the page. (For the page cache, the
|
||||
/// lease object represents a lock on the buffer.)
|
||||
#[inline(always)]
|
||||
pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
self.reader.read_blk(blknum)
|
||||
}
|
||||
@@ -139,13 +165,12 @@ where
|
||||
assert!(buf.len() == PAGE_SZ);
|
||||
self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl<F> BlockReader for FileBlockReader<F>
|
||||
where
|
||||
F: FileExt,
|
||||
{
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
/// Read a block.
|
||||
///
|
||||
/// Returns a "lease" object that can be used to
|
||||
/// access the contents of the page. (For the page cache, the
|
||||
/// lease object represents a lock on the buffer.)
|
||||
pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache
|
||||
@@ -170,6 +195,18 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockReader for FileBlockReader<File> {
|
||||
fn block_cursor(&self) -> BlockCursor<'_> {
|
||||
BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self))
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockReader for FileBlockReader<VirtualFile> {
|
||||
fn block_cursor(&self) -> BlockCursor<'_> {
|
||||
BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self))
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Trait for block-oriented output
|
||||
///
|
||||
|
||||
@@ -7,6 +7,7 @@ use anyhow::Context;
use pageserver_api::models::TenantState;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use tokio::sync::OwnedMutexGuard;
use tokio_util::sync::CancellationToken;
use tracing::{error, info, instrument, warn, Instrument, Span};

use utils::{
@@ -82,6 +83,8 @@ async fn create_remote_delete_mark(
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "mark_upload",
        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
    )
    .await
    .context("mark_upload")?;
@@ -171,6 +174,8 @@ async fn remove_tenant_remote_delete_mark(
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "remove_tenant_remote_delete_mark",
        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
    )
    .await
    .context("remove_tenant_remote_delete_mark")?;
@@ -252,6 +257,8 @@ pub(crate) async fn remote_delete_mark_exists(
        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
        "fetch_tenant_deletion_mark",
        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
    )
    .await;
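// Sketch (assumption, not the utils::backoff implementation used above): what the retry
// helper can look like once a real cancellation token is threaded through, instead of the
// placeholder `CancellationToken::new()` + `unreachable!()` kept until issue #5066 is fixed.
use std::time::Duration;
use tokio_util::sync::CancellationToken;

async fn retry_with_cancel<T, E, F, Fut>(mut op: F, attempts: u32, cancel: &CancellationToken) -> Option<Result<T, E>>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    for n in 0..attempts {
        match op().await {
            Ok(v) => return Some(Ok(v)),
            Err(e) if n + 1 == attempts => return Some(Err(e)),
            Err(_) => {
                // crude exponential backoff; the real helper uses its own base/cap constants
                let delay = Duration::from_millis(100 * 2u64.pow(n));
                tokio::select! {
                    _ = cancel.cancelled() => return None, // caller maps "cancelled" to its own error
                    _ = tokio::time::sleep(delay) => {}
                }
            }
        }
    }
    None
}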
|
||||
|
||||
|
||||
@@ -259,9 +259,10 @@ where
|
||||
{
|
||||
let mut stack = Vec::new();
|
||||
stack.push((self.root_blk, None));
|
||||
let block_cursor = self.reader.block_cursor();
|
||||
while let Some((node_blknum, opt_iter)) = stack.pop() {
|
||||
// Locate the node.
|
||||
let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;
|
||||
let node_buf = block_cursor.read_blk(self.start_blk + node_blknum)?;
|
||||
|
||||
let node = OnDiskNode::deparse(node_buf.as_ref())?;
|
||||
let prefix_len = node.prefix_len as usize;
|
||||
@@ -353,8 +354,10 @@ where
|
||||
|
||||
stack.push((self.root_blk, String::new(), 0, 0, 0));
|
||||
|
||||
let block_cursor = self.reader.block_cursor();
|
||||
|
||||
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
|
||||
let blk = self.reader.read_blk(self.start_blk + blknum)?;
|
||||
let blk = block_cursor.read_blk(self.start_blk + blknum)?;
|
||||
let buf: &[u8] = blk.as_ref();
|
||||
let node = OnDiskNode::<L>::deparse(buf)?;
|
||||
|
||||
@@ -683,29 +686,32 @@ impl<const L: usize> BuildNode<L> {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
pub(crate) mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::block_io::BlockLease;
|
||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
|
||||
use rand::Rng;
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
struct TestDisk {
|
||||
pub(crate) struct TestDisk {
|
||||
blocks: Vec<Bytes>,
|
||||
}
|
||||
impl TestDisk {
|
||||
fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
}
|
||||
impl BlockReader for TestDisk {
|
||||
fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
|
||||
pub(crate) fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
|
||||
let mut buf = [0u8; PAGE_SZ];
|
||||
buf.copy_from_slice(&self.blocks[blknum as usize]);
|
||||
Ok(std::rc::Rc::new(buf).into())
|
||||
}
|
||||
}
|
||||
impl BlockReader for TestDisk {
|
||||
fn block_cursor(&self) -> BlockCursor<'_> {
|
||||
BlockCursor::new(BlockReaderRef::TestDisk(self))
|
||||
}
|
||||
}
|
||||
impl BlockWriter for &mut TestDisk {
|
||||
fn write_blk(&mut self, buf: Bytes) -> io::Result<u32> {
|
||||
let blknum = self.blocks.len();
|
||||
|
||||
@@ -3,8 +3,7 @@
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::page_cache::{self, PAGE_SZ};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockLease, BlockReader};
|
||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use std::cmp::min;
|
||||
use std::fs::OpenOptions;
|
||||
@@ -22,7 +21,7 @@ pub struct EphemeralFile {
|
||||
_tenant_id: TenantId,
|
||||
_timeline_id: TimelineId,
|
||||
file: VirtualFile,
|
||||
size: u64,
|
||||
len: u64,
|
||||
/// An ephemeral file is append-only.
|
||||
/// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
|
||||
/// The other pages, which can no longer be modified, are accessed through the page cache.
|
||||
@@ -53,27 +52,56 @@ impl EphemeralFile {
|
||||
_tenant_id: tenant_id,
|
||||
_timeline_id: timeline_id,
|
||||
file,
|
||||
size: 0,
|
||||
len: 0,
|
||||
mutable_tail: [0u8; PAGE_SZ],
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn size(&self) -> u64 {
|
||||
self.size
|
||||
pub(crate) fn len(&self) -> u64 {
|
||||
self.len
|
||||
}
|
||||
}
|
||||
|
||||
/// Does the given filename look like an ephemeral file?
|
||||
pub fn is_ephemeral_file(filename: &str) -> bool {
|
||||
if let Some(rest) = filename.strip_prefix("ephemeral-") {
|
||||
rest.parse::<u32>().is_ok()
|
||||
} else {
|
||||
false
|
||||
pub(crate) fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
|
||||
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
|
||||
if flushed_blknums.contains(&(blknum as u64)) {
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum)
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
// order path before error because error is anyhow::Error => might have many contexts
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum,
|
||||
self.file.path.display(),
|
||||
e,
|
||||
),
|
||||
)
|
||||
})? {
|
||||
page_cache::ReadBufResult::Found(guard) => {
|
||||
return Ok(BlockLease::PageReadGuard(guard))
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
self.file
|
||||
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// Swap for read lock
|
||||
continue;
|
||||
}
|
||||
};
|
||||
}
|
||||
} else {
|
||||
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
|
||||
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
|
||||
}
|
||||
}
|
||||
}
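// Sketch (hypothetical helper, not in the diff): the block classification that
// EphemeralFile::read_blk above relies on. Fully written pages are read through the
// page cache; only the final, still-mutable page is served from `mutable_tail`.
const PAGE_SZ_ASSUMED: u64 = 8192; // stand-in for crate::page_cache::PAGE_SZ

enum BlockSource {
    PageCache,   // block has been flushed to the underlying file
    MutableTail, // block is still being appended to in memory
}

fn classify_block(file_len: u64, blknum: u32) -> BlockSource {
    let flushed_blocks = file_len / PAGE_SZ_ASSUMED; // number of complete pages written out
    if (blknum as u64) < flushed_blocks {
        BlockSource::PageCache
    } else {
        debug_assert_eq!(blknum as u64, flushed_blocks); // only the tail page may follow
        BlockSource::MutableTail
    }
}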
|
||||
|
||||
impl BlobWriter for EphemeralFile {
    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
    pub(crate) async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
        struct Writer<'a> {
            ephemeral_file: &'a mut EphemeralFile,
            /// The block to which the next [`push_bytes`] will write.
@@ -84,13 +112,13 @@ impl BlobWriter for EphemeralFile {
        impl<'a> Writer<'a> {
            fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
                Ok(Writer {
                    blknum: (ephemeral_file.size / PAGE_SZ as u64) as u32,
                    off: (ephemeral_file.size % PAGE_SZ as u64) as usize,
                    blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
                    off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
                    ephemeral_file,
                })
            }
            #[inline(always)]
            fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
            async fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
                let mut src_remaining = src;
                while !src_remaining.is_empty() {
                    let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
@@ -154,39 +182,47 @@ impl BlobWriter for EphemeralFile {
            }
        }

        let pos = self.size;
        let pos = self.len;
        let mut writer = Writer::new(self)?;

        // Write the length field
        if srcbuf.len() < 0x80 {
            // short one-byte length header
            let len_buf = [srcbuf.len() as u8];
            writer.push_bytes(&len_buf)?;
            writer.push_bytes(&len_buf).await?;
        } else {
            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
            len_buf[0] |= 0x80;
            writer.push_bytes(&len_buf)?;
            writer.push_bytes(&len_buf).await?;
        }

        // Write the payload
        writer.push_bytes(srcbuf)?;
        writer.push_bytes(srcbuf).await?;

        if srcbuf.len() < 0x80 {
            self.size += 1;
            self.len += 1;
        } else {
            self.size += 4;
            self.len += 4;
        }
        self.size += srcbuf.len() as u64;
        self.len += srcbuf.len() as u64;

        Ok(pos)
    }
}
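// Sketch (hypothetical helpers mirroring the header format written above): blob lengths
// below 0x80 get a single length byte; longer blobs get a 4-byte big-endian length with
// the top bit set as a marker. The decoder is the assumed inverse used by the blob reader.
fn encode_len_header(len: usize) -> Vec<u8> {
    if len < 0x80 {
        vec![len as u8] // short one-byte length header
    } else {
        let mut buf = u32::to_be_bytes(len as u32);
        buf[0] |= 0x80; // high bit marks the 4-byte form
        buf.to_vec()
    }
}

fn decode_len_header(buf: &[u8]) -> (usize, usize) {
    if buf[0] & 0x80 == 0 {
        (buf[0] as usize, 1) // (blob length, header bytes consumed)
    } else {
        let mut be = [0u8; 4];
        be.copy_from_slice(&buf[..4]);
        be[0] &= !0x80; // clear the marker bit before decoding
        (u32::from_be_bytes(be) as usize, 4)
    }
}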
|
||||
|
||||
/// Does the given filename look like an ephemeral file?
pub fn is_ephemeral_file(filename: &str) -> bool {
    if let Some(rest) = filename.strip_prefix("ephemeral-") {
        rest.parse::<u32>().is_ok()
    } else {
        false
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for EphemeralFile {
|
||||
fn drop(&mut self) {
|
||||
// drop all pages from page cache
|
||||
let cache = page_cache::get();
|
||||
cache.drop_buffers_for_immutable(self.page_cache_file_id);
|
||||
// There might still be pages in the [`crate::page_cache`] for this file.
|
||||
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
|
||||
|
||||
// unlink the file
|
||||
let res = std::fs::remove_file(&self.file.path);
|
||||
@@ -207,52 +243,15 @@ impl Drop for EphemeralFile {
|
||||
}
|
||||
|
||||
impl BlockReader for EphemeralFile {
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
|
||||
let flushed_blknums = 0..self.size / PAGE_SZ as u64;
|
||||
if flushed_blknums.contains(&(blknum as u64)) {
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum)
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
// order path before error because error is anyhow::Error => might have many contexts
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum,
|
||||
self.file.path.display(),
|
||||
e,
|
||||
),
|
||||
)
|
||||
})? {
|
||||
page_cache::ReadBufResult::Found(guard) => {
|
||||
return Ok(BlockLease::PageReadGuard(guard))
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
self.file
|
||||
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// Swap for read lock
|
||||
continue;
|
||||
}
|
||||
};
|
||||
}
|
||||
} else {
|
||||
debug_assert_eq!(blknum as u64, self.size / PAGE_SZ as u64);
|
||||
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
|
||||
}
|
||||
fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
|
||||
BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
|
||||
use rand::{thread_rng, RngCore};
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
@@ -280,12 +279,12 @@ mod tests {
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
|
||||
let pos_foo = file.write_blob(b"foo")?;
|
||||
let pos_foo = file.write_blob(b"foo").await?;
|
||||
assert_eq!(
|
||||
b"foo",
|
||||
file.block_cursor().read_blob(pos_foo).await?.as_slice()
|
||||
);
|
||||
let pos_bar = file.write_blob(b"bar")?;
|
||||
let pos_bar = file.write_blob(b"bar").await?;
|
||||
assert_eq!(
|
||||
b"foo",
|
||||
file.block_cursor().read_blob(pos_foo).await?.as_slice()
|
||||
@@ -298,17 +297,17 @@ mod tests {
|
||||
let mut blobs = Vec::new();
|
||||
for i in 0..10000 {
|
||||
let data = Vec::from(format!("blob{}", i).as_bytes());
|
||||
let pos = file.write_blob(&data)?;
|
||||
let pos = file.write_blob(&data).await?;
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
// also test with a large blobs
|
||||
for i in 0..100 {
|
||||
let data = format!("blob{}", i).as_bytes().repeat(100);
|
||||
let pos = file.write_blob(&data)?;
|
||||
let pos = file.write_blob(&data).await?;
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
|
||||
let cursor = BlockCursor::new(&file);
|
||||
let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
|
||||
for (pos, expected) in blobs {
|
||||
let actual = cursor.read_blob(pos).await?;
|
||||
assert_eq!(actual, expected);
|
||||
@@ -318,7 +317,7 @@ mod tests {
|
||||
let mut large_data = Vec::new();
|
||||
large_data.resize(20000, 0);
|
||||
thread_rng().fill_bytes(&mut large_data);
|
||||
let pos_large = file.write_blob(&large_data)?;
|
||||
let pos_large = file.write_blob(&large_data).await?;
|
||||
let result = file.block_cursor().read_blob(pos_large).await?;
|
||||
assert_eq!(result, large_data);
|
||||
|
||||
|
||||
@@ -50,7 +50,6 @@ use crate::context::RequestContext;
|
||||
use crate::keyspace::KeyPartitioning;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use anyhow::Result;
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
|
||||
@@ -12,7 +12,7 @@ use std::fs::{File, OpenOptions};
|
||||
use std::io::{self, Write};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde::{de::Error, Deserialize, Serialize, Serializer};
|
||||
use thiserror::Error;
|
||||
use tracing::info_span;
|
||||
use utils::bin_ser::SerializeError;
|
||||
@@ -232,6 +232,28 @@ impl TimelineMetadata {
    }
}

impl<'de> Deserialize<'de> for TimelineMetadata {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let bytes = Vec::<u8>::deserialize(deserializer)?;
        Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}")))
    }
}

impl Serialize for TimelineMetadata {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let bytes = self
            .to_bytes()
            .map_err(|e| serde::ser::Error::custom(format!("{e}")))?;
        bytes.serialize(serializer)
    }
}
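// Test-style sketch (assumptions: serde_json is only for illustration, any serde format
// behaves the same): with the impls above, TimelineMetadata round-trips through its legacy
// to_bytes()/from_bytes() encoding, so the `metadata_bytes` field in index_part.json keeps
// its existing array-of-bytes shape.
fn metadata_roundtrip(metadata: &TimelineMetadata) -> TimelineMetadata {
    let json = serde_json::to_string(metadata).expect("Serialize goes through to_bytes()");
    serde_json::from_str(&json).expect("Deserialize goes through from_bytes()")
}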
|
||||
|
||||
/// Save timeline metadata to file
pub fn save_metadata(
    conf: &'static PageServerConf,
|
||||
|
||||
@@ -29,6 +29,7 @@ use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::delete::DeleteTenantError;
|
||||
use super::timeline::delete::DeleteTimelineFlow;
|
||||
use super::TenantSharedResources;
|
||||
|
||||
/// The tenants known to the pageserver.
|
||||
/// The enum variants are used to distinguish the different states that the pageserver can be in.
|
||||
@@ -66,8 +67,7 @@ static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::
|
||||
#[instrument(skip_all)]
|
||||
pub async fn init_tenant_mgr(
|
||||
conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
resources: TenantSharedResources,
|
||||
init_order: InitializationOrder,
|
||||
) -> anyhow::Result<()> {
|
||||
// Scan local filesystem for attached tenants
|
||||
@@ -125,8 +125,7 @@ pub async fn init_tenant_mgr(
|
||||
match schedule_local_tenant_processing(
|
||||
conf,
|
||||
&tenant_dir_path,
|
||||
broker_client.clone(),
|
||||
remote_storage.clone(),
|
||||
resources.clone(),
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
&ctx,
|
||||
@@ -162,8 +161,7 @@ pub async fn init_tenant_mgr(
|
||||
pub(crate) fn schedule_local_tenant_processing(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
resources: TenantSharedResources,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
@@ -200,9 +198,15 @@ pub(crate) fn schedule_local_tenant_processing(
|
||||
|
||||
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
|
||||
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
match Tenant::spawn_attach(conf, tenant_id, broker_client, tenants, remote_storage, ctx)
|
||||
{
|
||||
if let Some(remote_storage) = resources.remote_storage {
|
||||
match Tenant::spawn_attach(
|
||||
conf,
|
||||
tenant_id,
|
||||
resources.broker_client,
|
||||
tenants,
|
||||
remote_storage,
|
||||
ctx,
|
||||
) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
|
||||
@@ -220,15 +224,7 @@ pub(crate) fn schedule_local_tenant_processing(
|
||||
} else {
|
||||
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
|
||||
// Start loading the tenant into memory. It will initially be in Loading state.
|
||||
Tenant::spawn_load(
|
||||
conf,
|
||||
tenant_id,
|
||||
broker_client,
|
||||
remote_storage,
|
||||
init_order,
|
||||
tenants,
|
||||
ctx,
|
||||
)
|
||||
Tenant::spawn_load(conf, tenant_id, resources, init_order, tenants, ctx)
|
||||
};
|
||||
Ok(tenant)
|
||||
}
|
||||
@@ -363,8 +359,12 @@ pub async fn create_tenant(
|
||||
// TODO: tenant directory remains on disk if we bail out from here on.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
let tenant_resources = TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage,
|
||||
};
|
||||
let created_tenant =
|
||||
schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, &TENANTS, ctx)?;
|
||||
schedule_local_tenant_processing(conf, &tenant_directory, tenant_resources, None, &TENANTS, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
@@ -523,7 +523,11 @@ pub async fn load_tenant(
|
||||
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
|
||||
}
|
||||
|
||||
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, &TENANTS, ctx)
|
||||
let resources = TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage,
|
||||
};
|
||||
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, resources, None, &TENANTS, ctx)
|
||||
.with_context(|| {
|
||||
format!("Failed to schedule tenant processing in path {tenant_path:?}")
|
||||
})?;
|
||||
@@ -604,7 +608,11 @@ pub async fn attach_tenant(
|
||||
.context("check for attach marker file existence")?;
|
||||
anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
|
||||
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, &TENANTS, ctx)?;
|
||||
let resources = TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage: Some(remote_storage),
|
||||
};
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, resources, None, &TENANTS, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
|
||||
@@ -135,7 +135,7 @@
|
||||
//! - Initiate upload queue with that [`IndexPart`].
|
||||
//! - Reschedule all lost operations by comparing the local filesystem state
|
||||
//! and remote state as per [`IndexPart`]. This is done in
|
||||
//! [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
|
||||
//! [`Tenant::timeline_init_and_sync`].
|
||||
//!
|
||||
//! Note that if we crash during file deletion between the index update
|
||||
//! that removes the file from the list of files, and deleting the remote file,
|
||||
@@ -172,7 +172,6 @@
|
||||
//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
|
||||
//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
|
||||
//!
|
||||
//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers.
|
||||
//! We keep track of the fact that a client is in `Attaching` state in a marker
|
||||
//! file on the local disk. This is critical because, when we restart the pageserver,
|
||||
//! we do not want to do the `List timelines` step for each tenant that has already
|
||||
@@ -192,14 +191,14 @@
|
||||
//! not created and the uploads are skipped.
|
||||
//! Theoretically, it should be ok to remove and re-add remote storage configuration to
|
||||
//! the pageserver config at any time, since it doesn't make a difference to
|
||||
//! `reconcile_with_remote`.
|
||||
//! [`Timeline::load_layer_map`].
|
||||
//! Of course, the remote timeline dir must not change while we have de-configured
|
||||
//! remote storage, i.e., the pageserver must remain the owner of the given prefix
|
||||
//! in remote storage.
|
||||
//! But note that we don't test any of this right now.
|
||||
//!
|
||||
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
|
||||
//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote
|
||||
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
|
||||
|
||||
mod delete;
|
||||
mod download;
|
||||
@@ -211,6 +210,7 @@ use chrono::{NaiveDateTime, Utc};
|
||||
// re-export these
|
||||
pub use download::{is_temp_download_file, list_remote_timelines};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::backoff::{
|
||||
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
@@ -231,6 +231,7 @@ use crate::metrics::{
|
||||
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
|
||||
REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
|
||||
};
|
||||
use crate::task_mgr::shutdown_token;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::upload_queue::Delete;
|
||||
@@ -353,6 +354,10 @@ impl RemoteTimelineClient {
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
info!(
|
||||
"initialized upload queue from remote index with {} layer files",
|
||||
index_part.layer_metadata.len()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -365,6 +370,7 @@ impl RemoteTimelineClient {
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_empty_remote(local_metadata)?;
|
||||
self.update_remote_physical_size_gauge(None);
|
||||
info!("initialized upload queue as empty");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -535,8 +541,7 @@ impl RemoteTimelineClient {
|
||||
// ahead of what's _actually_ on the remote during index upload.
|
||||
upload_queue.latest_metadata = metadata.clone();
|
||||
|
||||
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
|
||||
self.schedule_index_upload(upload_queue, metadata_bytes);
|
||||
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -556,8 +561,7 @@ impl RemoteTimelineClient {
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
|
||||
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
|
||||
self.schedule_index_upload(upload_queue, metadata_bytes);
|
||||
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -567,7 +571,7 @@ impl RemoteTimelineClient {
|
||||
fn schedule_index_upload(
|
||||
self: &Arc<Self>,
|
||||
upload_queue: &mut UploadQueueInitialized,
|
||||
metadata_bytes: Vec<u8>,
|
||||
metadata: TimelineMetadata,
|
||||
) {
|
||||
info!(
|
||||
"scheduling metadata upload with {} files ({} changed)",
|
||||
@@ -580,7 +584,7 @@ impl RemoteTimelineClient {
|
||||
let index_part = IndexPart::new(
|
||||
upload_queue.latest_files.clone(),
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
metadata,
|
||||
);
|
||||
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
@@ -636,7 +640,7 @@ impl RemoteTimelineClient {
|
||||
|
||||
// Deleting layers doesn't affect the values stored in TimelineMetadata,
|
||||
// so we don't need update it. Just serialize it.
|
||||
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
|
||||
let metadata = upload_queue.latest_metadata.clone();
|
||||
|
||||
// Update the remote index file, removing the to-be-deleted files from the index,
|
||||
// before deleting the actual files.
|
||||
@@ -647,12 +651,13 @@ impl RemoteTimelineClient {
|
||||
// to syntactically forbid ? or bail! calls here.
|
||||
let no_bail_here = || {
|
||||
for name in names {
|
||||
upload_queue.latest_files.remove(name);
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
if upload_queue.latest_files.remove(name).is_some() {
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
|
||||
self.schedule_index_upload(upload_queue, metadata_bytes);
|
||||
self.schedule_index_upload(upload_queue, metadata);
|
||||
}
|
||||
|
||||
// schedule the actual deletions
|
||||
@@ -754,7 +759,7 @@ impl RemoteTimelineClient {
|
||||
pausable_failpoint!("persist_deleted_index_part");
|
||||
|
||||
backoff::retry(
|
||||
|| async {
|
||||
|| {
|
||||
upload::upload_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
@@ -762,7 +767,6 @@ impl RemoteTimelineClient {
|
||||
&self.timeline_id,
|
||||
&index_part_with_deleted_at,
|
||||
)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
1,
|
||||
@@ -771,6 +775,8 @@ impl RemoteTimelineClient {
|
||||
// when executed as part of tenant deletion this happens in the background
|
||||
2,
|
||||
"persist_index_part_with_deleted_flag",
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -857,6 +863,7 @@ impl RemoteTimelineClient {
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"list_prefixes",
|
||||
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
|
||||
)
|
||||
.await
|
||||
.context("list prefixes")?;
|
||||
@@ -880,6 +887,7 @@ impl RemoteTimelineClient {
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"delete_objects",
|
||||
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
|
||||
)
|
||||
.await
|
||||
.context("delete_objects")?;
|
||||
@@ -901,6 +909,7 @@ impl RemoteTimelineClient {
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"delete_index",
|
||||
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
|
||||
)
|
||||
.await
|
||||
.context("delete_index")?;
|
||||
@@ -1066,6 +1075,15 @@ impl RemoteTimelineClient {
|
||||
.await
|
||||
}
|
||||
UploadOp::UploadMetadata(ref index_part, _lsn) => {
|
||||
let mention_having_future_layers = if cfg!(feature = "testing") {
|
||||
index_part
|
||||
.layer_metadata
|
||||
.keys()
|
||||
.any(|x| x.is_in_future(*_lsn))
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
let res = upload::upload_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
@@ -1083,6 +1101,10 @@ impl RemoteTimelineClient {
|
||||
.await;
|
||||
if res.is_ok() {
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
if mention_having_future_layers {
|
||||
// find rationale near crate::tenant::timeline::init::cleanup_future_layer
|
||||
tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup");
|
||||
}
|
||||
}
|
||||
res
|
||||
}
|
||||
@@ -1134,14 +1156,13 @@ impl RemoteTimelineClient {
            }

            // sleep until it's time to retry, or we're cancelled
            tokio::select! {
                _ = task_mgr::shutdown_watcher() => { },
                _ = exponential_backoff(
                    retries,
                    DEFAULT_BASE_BACKOFF_SECONDS,
                    DEFAULT_MAX_BACKOFF_SECONDS,
                ) => { },
            };
            exponential_backoff(
                retries,
                DEFAULT_BASE_BACKOFF_SECONDS,
                DEFAULT_MAX_BACKOFF_SECONDS,
                &shutdown_token(),
            )
            .await;
        }
    }
}
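// Sketch (assumption, not the utils::backoff helper itself): an exponential-backoff sleep
// that returns early when the passed token fires, matching how the loop above now hands
// `&shutdown_token()` to exponential_backoff instead of wrapping the sleep in tokio::select!.
// The real helper may cap and jitter the delay differently.
async fn backoff_sleep(retries: u32, base_secs: f64, max_secs: f64, cancel: &tokio_util::sync::CancellationToken) {
    let delay_secs = (base_secs * 2f64.powi(retries as i32)).min(max_secs);
    let delay = std::time::Duration::from_secs_f64(delay_secs);
    tokio::select! {
        _ = cancel.cancelled() => {}        // shutdown requested: stop waiting early
        _ = tokio::time::sleep(delay) => {} // normal case: wait, then let the caller retry
    }
}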
|
||||
@@ -1578,14 +1599,17 @@ mod tests {
|
||||
};
|
||||
|
||||
assert_file_list(
|
||||
&index_part.timeline_layers,
|
||||
&index_part
|
||||
.layer_metadata
|
||||
.keys()
|
||||
.map(|f| f.to_owned())
|
||||
.collect(),
|
||||
&[
|
||||
&layer_file_name_1.file_name(),
|
||||
&layer_file_name_2.file_name(),
|
||||
],
|
||||
);
|
||||
let downloaded_metadata = index_part.parse_metadata().unwrap();
|
||||
assert_eq!(downloaded_metadata, metadata);
|
||||
assert_eq!(index_part.metadata, metadata);
|
||||
|
||||
// Schedule upload and then a deletion. Check that the deletion is queued
|
||||
client
|
||||
|
||||
@@ -11,6 +11,7 @@ use std::time::Duration;
|
||||
use anyhow::{anyhow, Context};
|
||||
use tokio::fs;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{backoff, crashsafe};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
@@ -280,6 +281,10 @@ where
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
description,
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || -> DownloadError {
|
||||
unreachable!()
|
||||
}),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -62,10 +62,9 @@ pub struct IndexPart {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub deleted_at: Option<NaiveDateTime>,

    /// Layer names, which are stored on the remote storage.
    ///
    /// Additional metadata can might exist in `layer_metadata`.
    pub timeline_layers: HashSet<LayerFileName>,
    /// Legacy field: equal to the keys of `layer_metadata`, only written out for forward compat
    #[serde(default, skip_deserializing)]
    timeline_layers: HashSet<LayerFileName>,

    /// Per layer file name metadata, which can be present for a present or missing layer file.
    ///
@@ -74,10 +73,13 @@ pub struct IndexPart {
    pub layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,

    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
    // It's duplicated here for convenience.
    // It's duplicated for convenience when reading the serialized structure, but is
    // private because internally we would read from metadata instead.
    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,
    metadata_bytes: Vec<u8>,
    disk_consistent_lsn: Lsn,

    #[serde(rename = "metadata_bytes")]
    pub metadata: TimelineMetadata,
}
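// Sketch of the serde pattern used for the legacy field above (simplified, assumed types):
// `timeline_layers` is still written for older readers but ignored on read and always
// regenerated from the keys of `layer_metadata`.
use std::collections::{HashMap, HashSet};
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct IndexPartSketch {
    /// Legacy: written out for forward compatibility, never trusted on read.
    #[serde(default, skip_deserializing)]
    timeline_layers: HashSet<String>,
    /// Source of truth; `timeline_layers` is derived from these keys before serializing.
    layer_metadata: HashMap<String, u64>,
}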
|
||||
|
||||
impl IndexPart {
@@ -85,13 +87,17 @@ impl IndexPart {
    /// used to understand later versions.
    ///
    /// Version is currently informative only.
    const LATEST_VERSION: usize = 2;
    /// Version history
    /// - 2: added `deleted_at`
    /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
    /// is always generated from the keys of `layer_metadata`)
    const LATEST_VERSION: usize = 3;
    pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
pub fn new(
|
||||
layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata_bytes: Vec<u8>,
|
||||
metadata: TimelineMetadata,
|
||||
) -> Self {
|
||||
let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
|
||||
let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
|
||||
@@ -107,14 +113,10 @@ impl IndexPart {
|
||||
timeline_layers,
|
||||
layer_metadata,
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
metadata,
|
||||
deleted_at: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse_metadata(&self) -> anyhow::Result<TimelineMetadata> {
|
||||
TimelineMetadata::from_bytes(&self.metadata_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&UploadQueueInitialized> for IndexPart {
|
||||
@@ -122,12 +124,12 @@ impl TryFrom<&UploadQueueInitialized> for IndexPart {
|
||||
|
||||
fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
|
||||
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
|
||||
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
|
||||
let metadata = upload_queue.latest_metadata.clone();
|
||||
|
||||
Ok(Self::new(
|
||||
upload_queue.latest_files.clone(),
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
metadata,
|
||||
))
|
||||
}
|
||||
}
|
||||
@@ -166,7 +168,7 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
timeline_layers: HashSet::new(),
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
@@ -178,7 +180,7 @@ mod tests {
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: None,
|
||||
};
|
||||
|
||||
@@ -197,13 +199,13 @@ mod tests {
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
timeline_layers: HashSet::new(),
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
@@ -215,7 +217,7 @@ mod tests {
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: None,
|
||||
};
|
||||
|
||||
@@ -234,14 +236,14 @@ mod tests {
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
|
||||
"deleted_at": "2023-07-31T09:00:00.123"
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 2,
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
timeline_layers: HashSet::new(),
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
@@ -253,7 +255,7 @@ mod tests {
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
|
||||
};
|
||||
@@ -277,7 +279,7 @@ mod tests {
|
||||
timeline_layers: HashSet::new(),
|
||||
layer_metadata: HashMap::new(),
|
||||
disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [
|
||||
metadata: TimelineMetadata::from_bytes(&[
|
||||
136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83,
|
||||
38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255,
|
||||
210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73,
|
||||
@@ -298,8 +300,8 @@ mod tests {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0,
|
||||
]
|
||||
.to_vec(),
|
||||
])
|
||||
.unwrap(),
|
||||
deleted_at: None,
|
||||
};
|
||||
|
||||
|
||||
@@ -41,8 +41,6 @@ pub use inmemory_layer::InMemoryLayer;
|
||||
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
||||
pub use remote_layer::RemoteLayer;
|
||||
|
||||
use super::timeline::layer_manager::LayerManager;
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
T: PartialOrd<T>,
|
||||
@@ -175,16 +173,9 @@ impl LayerAccessStats {
|
||||
///
|
||||
/// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn for_loading_layer(
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
status: LayerResidenceStatus,
|
||||
) -> Self {
|
||||
pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
|
||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
||||
new.record_residence_event(
|
||||
layer_map_lock_held_witness,
|
||||
status,
|
||||
LayerResidenceEventReason::LayerLoad,
|
||||
);
|
||||
new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
|
||||
new
|
||||
}
|
||||
|
||||
@@ -197,7 +188,6 @@ impl LayerAccessStats {
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn clone_for_residence_change(
|
||||
&self,
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
new_status: LayerResidenceStatus,
|
||||
) -> LayerAccessStats {
|
||||
let clone = {
|
||||
@@ -205,11 +195,7 @@ impl LayerAccessStats {
|
||||
inner.clone()
|
||||
};
|
||||
let new = LayerAccessStats(Mutex::new(clone));
|
||||
new.record_residence_event(
|
||||
layer_map_lock_held_witness,
|
||||
new_status,
|
||||
LayerResidenceEventReason::ResidenceChange,
|
||||
);
|
||||
new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
|
||||
new
|
||||
}
|
||||
|
||||
@@ -229,7 +215,6 @@ impl LayerAccessStats {
|
||||
///
|
||||
pub(crate) fn record_residence_event(
|
||||
&self,
|
||||
_layer_map_lock_held_witness: &LayerManager,
|
||||
status: LayerResidenceStatus,
|
||||
reason: LayerResidenceEventReason,
|
||||
) {
|
||||
@@ -344,23 +329,6 @@ impl LayerAccessStats {
|
||||
/// are used in (timeline).
|
||||
#[async_trait::async_trait]
|
||||
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
|
||||
/// Range of keys that this layer covers
|
||||
fn get_key_range(&self) -> Range<Key>;
|
||||
|
||||
/// Inclusive start bound of the LSN range that this layer holds
|
||||
/// Exclusive end bound of the LSN range that this layer holds.
|
||||
///
|
||||
/// - For an open in-memory layer, this is MAX_LSN.
|
||||
/// - For a frozen in-memory layer or a delta layer, this is a valid end bound.
|
||||
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
|
||||
fn get_lsn_range(&self) -> Range<Lsn>;
|
||||
|
||||
/// Does this layer only contain some data for the key-range (incremental),
|
||||
/// or does it contain a version of every page? This is important to know
|
||||
/// for garbage collecting old layers: an incremental layer depends on
|
||||
/// the previous non-incremental layer.
|
||||
fn is_incremental(&self) -> bool;
|
||||
|
||||
///
|
||||
/// Return data needed to reconstruct given page at LSN.
|
||||
///
|
||||
@@ -380,9 +348,6 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult>;
|
||||
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Get a layer descriptor from a layer.
|
||||
@@ -467,7 +432,6 @@ pub mod tests {
|
||||
TimelineId::from_array([0; 16]),
|
||||
value.key_range,
|
||||
value.lsn,
|
||||
false,
|
||||
233,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -239,8 +239,54 @@ impl std::fmt::Debug for DeltaLayerInner {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for DeltaLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for DeltaLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for DeltaLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
|
||||
Some(self)
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
self.local_path()
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
self.delete_resident_layer_file()
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
self.info(reset)
|
||||
}
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats {
|
||||
self.access_stats()
|
||||
}
|
||||
}
|
||||
|
||||
impl DeltaLayer {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
@@ -272,30 +318,28 @@ impl Layer for DeltaLayer {
|
||||
|
||||
tree_reader.dump().await?;
|
||||
|
||||
let keys = DeltaLayerInner::load_keys(&Ref(&**inner)).await?;
|
||||
let keys = DeltaLayerInner::load_keys(&inner).await?;
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
let dump_blob = |val: ValueRef<_>| -> _ {
|
||||
async move {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
}
|
||||
};
|
||||
async fn dump_blob(val: ValueRef<'_>) -> Result<String> {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
}
|
||||
|
||||
for entry in keys {
|
||||
let DeltaEntry { key, lsn, val, .. } = entry;
|
||||
@@ -312,7 +356,7 @@ impl Layer for DeltaLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_value_reconstruct_data(
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -331,52 +375,19 @@ impl Layer for DeltaLayer {
|
||||
.await
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.layer_desc().key_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.layer_desc().lsn_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.layer_desc().is_incremental
|
||||
}
|
||||
}
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for DeltaLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for DeltaLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
|
||||
Some(self)
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
pub(crate) fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.filename().file_name();
|
||||
let lsn_range = self.get_lsn_range();
|
||||
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.layer_desc().filename().file_name();
|
||||
let lsn_range = self.layer_desc().lsn_range.clone();
|
||||
|
||||
let access_stats = self.access_stats.as_api_model(reset);
|
||||
|
||||
@@ -390,12 +401,10 @@ impl PersistentLayer for DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats {
|
||||
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
||||
&self.access_stats
|
||||
}
|
||||
}
|
||||
|
||||
impl DeltaLayer {
|
||||
fn path_for(
|
||||
path_or_conf: &PathOrConf,
|
||||
tenant_id: &TenantId,
|
||||
@@ -541,17 +550,12 @@ impl DeltaLayer {
|
||||
/// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
|
||||
///
|
||||
/// The value can be obtained via the [`ValueRef::load`] function.
|
||||
pub(crate) async fn load_keys(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Vec<DeltaEntry<Ref<&'_ DeltaLayerInner>>>> {
|
||||
pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.await
|
||||
.context("load delta layer keys")?;
|
||||
|
||||
let inner = Ref(&**inner);
|
||||
DeltaLayerInner::load_keys(&inner)
|
||||
DeltaLayerInner::load_keys(inner)
|
||||
.await
|
||||
.context("Layer index is corrupted")
|
||||
}
|
||||
@@ -947,14 +951,14 @@ impl DeltaLayerInner {
|
||||
|
||||
pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
|
||||
this: &T,
|
||||
) -> Result<Vec<DeltaEntry<T>>> {
|
||||
) -> Result<Vec<DeltaEntry<'_>>> {
|
||||
let dl = this.as_ref();
|
||||
let file = &dl.file;
|
||||
|
||||
let tree_reader =
|
||||
DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
|
||||
|
||||
let mut all_keys: Vec<DeltaEntry<T>> = Vec::new();
|
||||
let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();
|
||||
|
||||
tree_reader
|
||||
.visit(
|
||||
@@ -964,7 +968,9 @@ impl DeltaLayerInner {
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let val_ref = ValueRef {
|
||||
blob_ref: BlobRef(value),
|
||||
reader: BlockCursor::new(Adapter(this.clone())),
|
||||
reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
|
||||
Adapter(dl),
|
||||
)),
|
||||
};
|
||||
let pos = BlobRef(value).pos();
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
@@ -993,43 +999,23 @@ impl DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
/// Cloneable borrow wrapper to make borrows behave like smart pointers.
|
||||
///
|
||||
/// Shared references are trivially copyable. This wrapper avoids the confusion of otherwise
|
||||
/// attempting to clone DeltaLayerInner.
|
||||
pub(crate) struct Ref<T>(T);
|
||||
|
||||
impl<'a, T> AsRef<T> for Ref<&'a T> {
|
||||
fn as_ref(&self) -> &T {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Clone for Ref<&'a T> {
|
||||
fn clone(&self) -> Self {
|
||||
*self
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Copy for Ref<&'a T> {}
|
||||
|
||||
/// A set of data associated with a delta layer key and its value
|
||||
pub struct DeltaEntry<T: AsRef<DeltaLayerInner>> {
|
||||
pub struct DeltaEntry<'a> {
|
||||
pub key: Key,
|
||||
pub lsn: Lsn,
|
||||
/// Size of the stored value
|
||||
pub size: u64,
|
||||
/// Reference to the on-disk value
|
||||
pub val: ValueRef<T>,
|
||||
pub val: ValueRef<'a>,
|
||||
}
|
||||
|
||||
/// Reference to an on-disk value
|
||||
pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
|
||||
pub struct ValueRef<'a> {
|
||||
blob_ref: BlobRef,
|
||||
reader: BlockCursor<Adapter<T>>,
|
||||
reader: BlockCursor<'a>,
|
||||
}
|
||||
|
||||
impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
|
||||
impl<'a> ValueRef<'a> {
|
||||
/// Loads the value from disk
|
||||
pub async fn load(&self) -> Result<Value> {
|
||||
// theoretically we *could* record an access time for each, but it does not really matter
|
||||
@@ -1039,10 +1025,10 @@ impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
|
||||
}
|
||||
}
|
||||
|
||||
struct Adapter<T: AsRef<DeltaLayerInner>>(T);
|
||||
pub(crate) struct Adapter<T>(T);
|
||||
|
||||
impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
|
||||
pub(crate) fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
self.0.as_ref().file.read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -212,9 +212,20 @@ pub enum LayerFileName {
|
||||
}
|
||||
|
||||
impl LayerFileName {
|
||||
pub fn file_name(&self) -> String {
|
||||
pub(crate) fn file_name(&self) -> String {
|
||||
self.to_string()
|
||||
}
|
||||
|
||||
/// Determines if this layer file is considered to be in the future, meaning we will discard these
|
||||
/// layers during timeline initialization from the given disk_consistent_lsn.
|
||||
pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool {
|
||||
use LayerFileName::*;
|
||||
match self {
|
||||
Image(file_name) if file_name.lsn > disk_consistent_lsn => true,
|
||||
Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
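// Hedged sketch (not part of the diff) of the cutoff rule above, restated with plain u64 LSNs
// instead of the crate's Lsn type; the enum and function below are illustrative only.
enum NameKind {
    Image { lsn: u64 },
    Delta { lsn_range_end: u64 }, // exclusive end of the delta's LSN range
}

fn is_in_future_sketch(name: &NameKind, disk_consistent_lsn: u64) -> bool {
    match name {
        // an image exactly at disk_consistent_lsn is still covered by the flushed state
        NameKind::Image { lsn } => *lsn > disk_consistent_lsn,
        // the delta end is exclusive, so end == disk_consistent_lsn + 1 is still covered
        NameKind::Delta { lsn_range_end } => *lsn_range_end > disk_consistent_lsn + 1,
    }
}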
|
||||
|
||||
impl fmt::Display for LayerFileName {
|
||||
@@ -263,8 +274,8 @@ impl serde::Serialize for LayerFileName {
|
||||
S: serde::Serializer,
|
||||
{
|
||||
match self {
|
||||
Self::Image(fname) => serializer.serialize_str(&fname.to_string()),
|
||||
Self::Delta(fname) => serializer.serialize_str(&fname.to_string()),
|
||||
Self::Image(fname) => serializer.collect_str(fname),
|
||||
Self::Delta(fname) => serializer.collect_str(fname),
|
||||
}
|
||||
}
|
||||
}
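// Hedged note on the serde change above: Serializer::collect_str feeds the Display output to the
// serializer, letting serializers that support it write the string directly instead of first
// building it via serialize_str(&fname.to_string()). Minimal illustration (names are ours):
fn serialize_display<S, T>(serializer: S, value: &T) -> Result<S::Ok, S::Error>
where
    S: serde::Serializer,
    T: std::fmt::Display + ?Sized,
{
    serializer.collect_str(value)
}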
|
||||
|
||||
@@ -169,8 +169,52 @@ impl std::fmt::Debug for ImageLayerInner {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for ImageLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
/// Look up given page in the file
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for ImageLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for ImageLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for ImageLayer {
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
self.local_path()
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
self.delete_resident_layer_file()
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
self.info(reset)
|
||||
}
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats {
|
||||
self.access_stats()
|
||||
}
|
||||
}
|
||||
|
||||
impl ImageLayer {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
@@ -178,7 +222,7 @@ impl Layer for ImageLayer {
|
||||
self.desc.key_range.start,
|
||||
self.desc.key_range.end,
|
||||
self.lsn,
|
||||
self.desc.is_incremental,
|
||||
self.desc.is_incremental(),
|
||||
self.desc.file_size
|
||||
);
|
||||
|
||||
@@ -203,8 +247,7 @@ impl Layer for ImageLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
async fn get_value_reconstruct_data(
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -225,65 +268,33 @@ impl Layer for ImageLayer {
|
||||
.with_context(|| format!("read {}", self.path().display()))
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.layer_desc().key_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.layer_desc().lsn_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.layer_desc().is_incremental
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for ImageLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for ImageLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for ImageLayer {
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
pub(crate) fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.filename().file_name();
|
||||
let lsn_range = self.get_lsn_range();
|
||||
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.layer_desc().filename().file_name();
|
||||
let lsn_start = self.layer_desc().image_layer_lsn();
|
||||
|
||||
HistoricLayerInfo::Image {
|
||||
layer_file_name,
|
||||
layer_file_size: self.desc.file_size,
|
||||
lsn_start: lsn_range.start,
|
||||
lsn_start,
|
||||
remote: false,
|
||||
access_stats: self.access_stats.as_api_model(reset),
|
||||
}
|
||||
}
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats {
|
||||
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
||||
&self.access_stats
|
||||
}
|
||||
}
|
||||
|
||||
impl ImageLayer {
|
||||
fn path_for(
|
||||
path_or_conf: &PathOrConf,
|
||||
timeline_id: TimelineId,
|
||||
@@ -371,7 +382,6 @@ impl ImageLayer {
|
||||
timeline_id,
|
||||
filename.key_range.clone(),
|
||||
filename.lsn,
|
||||
false,
|
||||
file_size,
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: filename.lsn,
|
||||
@@ -398,7 +408,6 @@ impl ImageLayer {
|
||||
summary.timeline_id,
|
||||
summary.key_range,
|
||||
summary.lsn,
|
||||
false,
|
||||
metadata.len(),
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: summary.lsn,
|
||||
@@ -500,7 +509,6 @@ struct ImageLayerWriterInner {
|
||||
tenant_id: TenantId,
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
is_incremental: bool,
|
||||
|
||||
blob_writer: WriteBlobWriter<VirtualFile>,
|
||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||
@@ -516,7 +524,6 @@ impl ImageLayerWriterInner {
|
||||
tenant_id: TenantId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
is_incremental: bool,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Create the file initially with a temporary filename.
|
||||
// We'll atomically rename it to the final name when we're done.
|
||||
@@ -551,7 +558,6 @@ impl ImageLayerWriterInner {
|
||||
lsn,
|
||||
tree: tree_builder,
|
||||
blob_writer,
|
||||
is_incremental,
|
||||
};
|
||||
|
||||
Ok(writer)
|
||||
@@ -612,7 +618,6 @@ impl ImageLayerWriterInner {
|
||||
self.timeline_id,
|
||||
self.key_range.clone(),
|
||||
self.lsn,
|
||||
self.is_incremental, // for now, image layer ALWAYS covers the full range
|
||||
metadata.len(),
|
||||
);
|
||||
|
||||
@@ -687,7 +692,6 @@ impl ImageLayerWriter {
|
||||
tenant_id: TenantId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
is_incremental: bool,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
Ok(Self {
|
||||
inner: Some(ImageLayerWriterInner::new(
|
||||
@@ -696,7 +700,6 @@ impl ImageLayerWriter {
|
||||
tenant_id,
|
||||
key_range,
|
||||
lsn,
|
||||
is_incremental,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -7,14 +7,12 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
|
||||
use crate::walrecord;
|
||||
use anyhow::{ensure, Result};
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::OnceLock;
|
||||
use tracing::*;
|
||||
@@ -32,12 +30,6 @@ use tokio::sync::RwLock;
|
||||
|
||||
use super::{DeltaLayer, DeltaLayerWriter, Layer};
|
||||
|
||||
thread_local! {
|
||||
/// A buffer for serializing object during [`InMemoryLayer::put_value`].
|
||||
/// This buffer is reused for each serialization to avoid additional malloc calls.
|
||||
static SER_BUFFER: RefCell<Vec<u8>> = RefCell::new(Vec::new());
|
||||
}
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
@@ -85,11 +77,11 @@ impl std::fmt::Debug for InMemoryLayerInner {
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
pub fn get_timeline_id(&self) -> TimelineId {
|
||||
pub(crate) fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
|
||||
pub fn info(&self) -> InMemoryLayerInfo {
|
||||
pub(crate) fn info(&self) -> InMemoryLayerInfo {
|
||||
let lsn_start = self.start_lsn;
|
||||
|
||||
if let Some(&lsn_end) = self.end_lsn.get() {
|
||||
@@ -99,32 +91,22 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
fn assert_writable(&self) {
|
||||
pub(crate) fn assert_writable(&self) {
|
||||
assert!(self.end_lsn.get().is_none());
|
||||
}
|
||||
|
||||
fn end_lsn_or_max(&self) -> Lsn {
|
||||
pub(crate) fn end_lsn_or_max(&self) -> Lsn {
|
||||
self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
|
||||
}
|
||||
}
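// Hedged aside on end_lsn_or_max above: an open in-memory layer has not set its end LSN yet, so
// readers fall back to the maximum; freezing the layer sets the OnceLock exactly once. A plain
// u64 stand-in for Lsn:
fn end_or_max_sketch(end_lsn: &std::sync::OnceLock<u64>) -> u64 {
    end_lsn.get().copied().unwrap_or(u64::MAX)
}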
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for InMemoryLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
Key::MIN..Key::MAX
|
||||
}
|
||||
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.start_lsn..self.end_lsn_or_max()
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
// in-memory layer is always considered incremental.
|
||||
true
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
///
|
||||
/// this is likely completely unused
|
||||
pub async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let end_str = self.end_lsn_or_max();
|
||||
@@ -171,7 +153,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
async fn get_value_reconstruct_data(
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -221,6 +203,20 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for InMemoryLayer {
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
self.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
}
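// Hedged sketch of the pattern above: the inherent pub(crate) method owns the logic and the trait
// impl is a thin forwarder. Method resolution prefers inherent methods, so the call inside the
// trait impl does not recurse. Names below are illustrative, not from the crate.
trait Describe {
    fn describe(&self) -> String;
}

struct Demo;

impl Demo {
    fn describe(&self) -> String {
        "inherent method does the work".to_string()
    }
}

impl Describe for Demo {
    fn describe(&self) -> String {
        // resolves to the inherent Demo::describe, not to <Demo as Describe>::describe
        self.describe()
    }
}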
|
||||
|
||||
impl std::fmt::Display for InMemoryLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let end_lsn = self.end_lsn_or_max();
|
||||
@@ -234,7 +230,7 @@ impl InMemoryLayer {
|
||||
///
|
||||
pub async fn size(&self) -> Result<u64> {
|
||||
let inner = self.inner.read().await;
|
||||
Ok(inner.file.size())
|
||||
Ok(inner.file.len())
|
||||
}
|
||||
|
||||
///
|
||||
@@ -269,17 +265,17 @@ impl InMemoryLayer {
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
let mut inner = self.inner.write().await;
|
||||
let inner: &mut _ = &mut *self.inner.write().await;
|
||||
self.assert_writable();
|
||||
|
||||
let off = {
|
||||
SER_BUFFER.with(|x| -> Result<_> {
|
||||
let mut buf = x.borrow_mut();
|
||||
buf.clear();
|
||||
val.ser_into(&mut (*buf))?;
|
||||
let off = inner.file.write_blob(&buf)?;
|
||||
Ok(off)
|
||||
})?
|
||||
// Avoid doing allocations for "small" values.
|
||||
// In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
|
||||
// https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
|
||||
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
|
||||
buf.clear();
|
||||
val.ser_into(&mut buf)?;
|
||||
inner.file.write_blob(&buf).await?
|
||||
};
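// Hedged aside (illustrative, not part of the diff): SmallVec<[u8; 256]> keeps payloads of up to
// 256 bytes inline and only heap-allocates ("spills") for larger values, which is what the
// allocation-avoidance comment above relies on.
fn smallvec_inline_demo() {
    let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
    buf.extend_from_slice(&[0u8; 200]);
    assert!(!buf.spilled()); // still stored inline
    buf.extend_from_slice(&[0u8; 100]);
    assert!(buf.spilled()); // 300 bytes exceed the inline capacity
}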
|
||||
|
||||
let vec_map = inner.index.entry(key).or_default();
|
||||
@@ -317,7 +313,7 @@ impl InMemoryLayer {
|
||||
/// Write this frozen in-memory layer to disk.
|
||||
///
|
||||
/// Returns a new delta layer with all the same data as this in-memory layer
|
||||
pub async fn write_to_disk(&self) -> Result<DeltaLayer> {
|
||||
pub(crate) async fn write_to_disk(&self) -> Result<DeltaLayer> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
|
||||
@@ -19,16 +19,17 @@ use serde::{Deserialize, Serialize};
|
||||
pub struct PersistentLayerDesc {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
/// Range of keys that this layer covers
|
||||
pub key_range: Range<Key>,
|
||||
/// For image layer, this is `[lsn, lsn+1)`.
|
||||
/// Inclusive start, exclusive end of the LSN range that this layer holds.
|
||||
///
|
||||
/// - For an open in-memory layer, the end bound is MAX_LSN
|
||||
/// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the
|
||||
/// range start
|
||||
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
|
||||
pub lsn_range: Range<Lsn>,
|
||||
/// Whether this is a delta layer.
|
||||
/// Whether this is a delta layer, and also whether it is incremental.
|
||||
pub is_delta: bool,
|
||||
/// Whether this layer only contains page images for part of the keys in the range. In the current implementation, this should
|
||||
/// always be equal to `is_delta`. If we land the partial image layer PR someday, an image layer could also be
|
||||
/// incremental.
|
||||
pub is_incremental: bool,
|
||||
/// File size
|
||||
pub file_size: u64,
|
||||
}
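// Hedged sketch of the lsn_range conventions documented above, using plain u64 stand-ins for the
// crate's Lsn type; the function names here are illustrative.
fn image_layer_lsn_range_sketch(snapshot_lsn: u64) -> std::ops::Range<u64> {
    // an image layer is a snapshot at a single LSN, stored as [lsn, lsn + 1)
    snapshot_lsn..snapshot_lsn + 1
}

fn open_in_memory_lsn_range_sketch(start_lsn: u64) -> std::ops::Range<u64> {
    // an open in-memory layer has no end bound yet, so it runs to the maximum LSN
    start_lsn..u64::MAX
}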
|
||||
|
||||
@@ -61,7 +62,6 @@ impl PersistentLayerDesc {
|
||||
key_range,
|
||||
lsn_range: Lsn(0)..Lsn(1),
|
||||
is_delta: false,
|
||||
is_incremental: false,
|
||||
file_size: 0,
|
||||
}
|
||||
}
|
||||
@@ -71,7 +71,6 @@ impl PersistentLayerDesc {
|
||||
timeline_id: TimelineId,
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
is_incremental: bool,
|
||||
file_size: u64,
|
||||
) -> Self {
|
||||
Self {
|
||||
@@ -80,7 +79,6 @@ impl PersistentLayerDesc {
|
||||
key_range,
|
||||
lsn_range: Self::image_layer_lsn_range(lsn),
|
||||
is_delta: false,
|
||||
is_incremental,
|
||||
file_size,
|
||||
}
|
||||
}
|
||||
@@ -98,7 +96,6 @@ impl PersistentLayerDesc {
|
||||
key_range,
|
||||
lsn_range,
|
||||
is_delta: true,
|
||||
is_incremental: true,
|
||||
file_size,
|
||||
}
|
||||
}
|
||||
@@ -164,8 +161,12 @@ impl PersistentLayerDesc {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
/// Does this layer only contain some data for the key-range (incremental),
|
||||
/// or does it contain a version of every page? This is important to know
|
||||
/// for garbage collecting old layers: an incremental layer depends on
|
||||
/// the previous non-incremental layer.
|
||||
pub fn is_incremental(&self) -> bool {
|
||||
self.is_incremental
|
||||
self.is_delta
|
||||
}
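    // Hedged sketch of the GC note above: reconstructing a page needs the newest non-incremental
    // (image) layer plus every incremental layer stacked on top of it, so all of those must be
    // kept. The slice below is ordered newest-first and is purely illustrative.
    fn layers_needed_sketch(newest_first_is_incremental: &[bool]) -> usize {
        let mut needed = 0;
        for &is_incremental in newest_first_is_incremental {
            needed += 1;
            if !is_incremental {
                break; // reached a full image; older layers are not required
            }
        }
        needed
    }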
|
||||
|
||||
pub fn is_delta(&self) -> bool {
|
||||
@@ -182,7 +183,7 @@ impl PersistentLayerDesc {
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end,
|
||||
self.is_delta,
|
||||
self.is_incremental,
|
||||
self.is_incremental(),
|
||||
self.file_size,
|
||||
);
|
||||
|
||||
|
||||
@@ -60,7 +60,7 @@ impl std::fmt::Debug for RemoteLayer {
|
||||
f.debug_struct("RemoteLayer")
|
||||
.field("file_name", &self.desc.filename())
|
||||
.field("layer_metadata", &self.layer_metadata)
|
||||
.field("is_incremental", &self.desc.is_incremental)
|
||||
.field("is_incremental", &self.desc.is_incremental())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -76,39 +76,6 @@ impl Layer for RemoteLayer {
|
||||
) -> Result<ValueReconstructResult> {
|
||||
bail!("layer {self} needs to be downloaded");
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
self.desc.timeline_id,
|
||||
self.desc.key_range.start,
|
||||
self.desc.key_range.end,
|
||||
self.desc.lsn_range.start,
|
||||
self.desc.lsn_range.end,
|
||||
self.desc.is_delta,
|
||||
self.desc.is_incremental,
|
||||
self.desc.file_size,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.layer_desc().key_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.layer_desc().lsn_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.layer_desc().is_incremental
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
@@ -142,8 +109,8 @@ impl PersistentLayer for RemoteLayer {
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.filename().file_name();
|
||||
let lsn_range = self.get_lsn_range();
|
||||
let layer_file_name = self.layer_desc().filename().file_name();
|
||||
let lsn_range = self.layer_desc().lsn_range.clone();
|
||||
|
||||
if self.desc.is_delta {
|
||||
HistoricLayerInfo::Delta {
|
||||
@@ -184,7 +151,6 @@ impl RemoteLayer {
|
||||
timelineid,
|
||||
fname.key_range.clone(),
|
||||
fname.lsn,
|
||||
false,
|
||||
layer_metadata.file_size(),
|
||||
),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
@@ -217,9 +183,9 @@ impl RemoteLayer {
|
||||
}
|
||||
|
||||
/// Create a Layer struct representing this layer, after it has been downloaded.
|
||||
pub fn create_downloaded_layer(
|
||||
pub(crate) fn create_downloaded_layer(
|
||||
&self,
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
_layer_map_lock_held_witness: &LayerManager,
|
||||
conf: &'static PageServerConf,
|
||||
file_size: u64,
|
||||
) -> Arc<dyn PersistentLayer> {
|
||||
@@ -231,10 +197,8 @@ impl RemoteLayer {
|
||||
self.desc.tenant_id,
|
||||
&fname,
|
||||
file_size,
|
||||
self.access_stats.clone_for_residence_change(
|
||||
layer_map_lock_held_witness,
|
||||
LayerResidenceStatus::Resident,
|
||||
),
|
||||
self.access_stats
|
||||
.clone_for_residence_change(LayerResidenceStatus::Resident),
|
||||
))
|
||||
} else {
|
||||
let fname = self.desc.image_file_name();
|
||||
@@ -244,10 +208,8 @@ impl RemoteLayer {
|
||||
self.desc.tenant_id,
|
||||
&fname,
|
||||
file_size,
|
||||
self.access_stats.clone_for_residence_change(
|
||||
layer_map_lock_held_witness,
|
||||
LayerResidenceStatus::Resident,
|
||||
),
|
||||
self.access_stats
|
||||
.clone_for_residence_change(LayerResidenceStatus::Resident),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
pub mod delete;
|
||||
mod eviction_task;
|
||||
mod init;
|
||||
pub mod layer_manager;
|
||||
mod logical_size;
|
||||
pub mod span;
|
||||
@@ -27,7 +28,6 @@ use utils::id::TenantTimelineId;
|
||||
|
||||
use std::cmp::{max, min, Ordering};
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet};
|
||||
use std::fs;
|
||||
use std::ops::{Deref, Range};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::pin::pin;
|
||||
@@ -38,15 +38,13 @@ use std::time::{Duration, Instant, SystemTime};
|
||||
use crate::context::{
|
||||
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
|
||||
};
|
||||
use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
|
||||
use crate::tenant::storage_layer::{
|
||||
DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
|
||||
LayerAccessStats, LayerFileName, RemoteLayer,
|
||||
DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer,
|
||||
};
|
||||
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
||||
use crate::tenant::{
|
||||
ephemeral_file::is_ephemeral_file,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
metadata::{save_metadata, TimelineMetadata},
|
||||
par_fsync,
|
||||
@@ -78,11 +76,10 @@ use utils::{
|
||||
use crate::page_cache;
|
||||
use crate::repository::GcResult;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::METADATA_FILE_NAME;
|
||||
use crate::ZERO_PAGE;
|
||||
use crate::{is_temporary, task_mgr};
|
||||
|
||||
use self::delete::DeleteTimelineFlow;
|
||||
pub(super) use self::eviction_task::EvictionTaskTenantState;
|
||||
@@ -95,7 +92,7 @@ use super::config::TenantConf;
|
||||
use super::remote_timeline_client::index::IndexPart;
|
||||
use super::remote_timeline_client::RemoteTimelineClient;
|
||||
use super::storage_layer::{
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc,
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStatsReset, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
@@ -140,6 +137,12 @@ fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
|
||||
fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||
drop(rlock)
|
||||
}
|
||||
|
||||
/// The outward-facing resources required to build a Timeline
|
||||
pub struct TimelineResources {
|
||||
pub remote_client: Option<RemoteTimelineClient>,
|
||||
}
|
||||
|
||||
pub struct Timeline {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||
@@ -1205,7 +1208,7 @@ impl Timeline {
|
||||
&layer_metadata,
|
||||
local_layer
|
||||
.access_stats()
|
||||
.clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
|
||||
.clone_for_residence_change(LayerResidenceStatus::Evicted),
|
||||
),
|
||||
LayerFileName::Delta(delta_name) => RemoteLayer::new_delta(
|
||||
self.tenant_id,
|
||||
@@ -1214,7 +1217,7 @@ impl Timeline {
|
||||
&layer_metadata,
|
||||
local_layer
|
||||
.access_stats()
|
||||
.clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
|
||||
.clone_for_residence_change(LayerResidenceStatus::Evicted),
|
||||
),
|
||||
});
|
||||
|
||||
@@ -1374,7 +1377,7 @@ impl Timeline {
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
resources: TimelineResources,
|
||||
pg_version: u32,
|
||||
initial_logical_size_can_start: Option<completion::Barrier>,
|
||||
initial_logical_size_attempt: Option<completion::Completion>,
|
||||
@@ -1409,7 +1412,7 @@ impl Timeline {
|
||||
walredo_mgr,
|
||||
walreceiver: Mutex::new(None),
|
||||
|
||||
remote_client: remote_client.map(Arc::new),
|
||||
remote_client: resources.remote_client.map(Arc::new),
|
||||
|
||||
// initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
|
||||
last_record_lsn: SeqWait::new(RecordLsn {
|
||||
@@ -1512,7 +1515,7 @@ impl Timeline {
|
||||
let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
|
||||
let self_clone = Arc::clone(self);
|
||||
|
||||
info!("spawning flush loop");
|
||||
debug!("spawning flush loop");
|
||||
*flush_loop_state = FlushLoopState::Running {
|
||||
#[cfg(test)]
|
||||
expect_initdb_optimization: false,
|
||||
@@ -1583,9 +1586,7 @@ impl Timeline {
|
||||
));
|
||||
}
|
||||
|
||||
///
|
||||
/// Initialize with an empty layer map. Used when creating a new timeline.
|
||||
///
|
||||
pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) {
|
||||
let mut layers = self.layers.try_write().expect(
|
||||
"in the context where we call this function, no other task has access to the object",
|
||||
@@ -1593,10 +1594,16 @@ impl Timeline {
|
||||
layers.initialize_empty(Lsn(start_lsn.0));
|
||||
}
|
||||
|
||||
///
|
||||
/// Scan the timeline directory to populate the layer map.
|
||||
///
|
||||
pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
|
||||
/// Scan the timeline directory, clean it up, populate the layer map, and schedule uploads for local-only
|
||||
/// files.
|
||||
pub(super) async fn load_layer_map(
|
||||
&self,
|
||||
disk_consistent_lsn: Lsn,
|
||||
index_part: Option<IndexPart>,
|
||||
) -> anyhow::Result<()> {
|
||||
use init::{Decision::*, Discovered, FutureLayer};
|
||||
use LayerFileName::*;
|
||||
|
||||
let mut guard = self.layers.write().await;
|
||||
|
||||
let timer = self.metrics.load_layer_map_histo.start_timer();
|
||||
@@ -1604,102 +1611,153 @@ impl Timeline {
|
||||
// Scan timeline directory and create ImageFileName and DeltaFilename
|
||||
// structs representing all files on disk
|
||||
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
|
||||
// total size of layer files in the current timeline directory
|
||||
let mut total_physical_size = 0;
|
||||
let (conf, tenant_id, timeline_id) = (self.conf, self.tenant_id, self.timeline_id);
|
||||
let span = tracing::Span::current();
|
||||
|
||||
let mut loaded_layers = Vec::<Arc<dyn PersistentLayer>>::new();
|
||||
let (loaded_layers, to_sync, total_physical_size) = tokio::task::spawn_blocking({
|
||||
move || {
|
||||
let _g = span.entered();
|
||||
let discovered = init::scan_timeline_dir(&timeline_path)?;
|
||||
let mut discovered_layers = Vec::with_capacity(discovered.len());
|
||||
let mut unrecognized_files = Vec::new();
|
||||
|
||||
for direntry in fs::read_dir(timeline_path)? {
|
||||
let direntry = direntry?;
|
||||
let direntry_path = direntry.path();
|
||||
let fname = direntry.file_name();
|
||||
let fname = fname.to_string_lossy();
|
||||
let mut path = timeline_path;
|
||||
|
||||
if let Some(filename) = ImageFileName::parse_str(&fname) {
|
||||
// create an ImageLayer struct for each image file.
|
||||
if filename.lsn > disk_consistent_lsn {
|
||||
info!(
|
||||
"found future image layer {} on timeline {} disk_consistent_lsn is {}",
|
||||
filename, self.timeline_id, disk_consistent_lsn
|
||||
);
|
||||
|
||||
rename_to_backup(&direntry_path)?;
|
||||
continue;
|
||||
for discovered in discovered {
|
||||
let (name, kind) = match discovered {
|
||||
Discovered::Layer(file_name, file_size) => {
|
||||
discovered_layers.push((file_name, file_size));
|
||||
continue;
|
||||
}
|
||||
Discovered::Metadata | Discovered::IgnoredBackup => {
|
||||
continue;
|
||||
}
|
||||
Discovered::Unknown(file_name) => {
|
||||
// we will later error if there are any
|
||||
unrecognized_files.push(file_name);
|
||||
continue;
|
||||
}
|
||||
Discovered::Ephemeral(name) => (name, "old ephemeral file"),
|
||||
Discovered::Temporary(name) => (name, "temporary timeline file"),
|
||||
Discovered::TemporaryDownload(name) => (name, "temporary download"),
|
||||
};
|
||||
path.push(name);
|
||||
init::cleanup(&path, kind)?;
|
||||
path.pop();
|
||||
}
|
||||
|
||||
let file_size = direntry_path.metadata()?.len();
|
||||
let stats =
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
|
||||
|
||||
let layer = ImageLayer::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
&filename,
|
||||
file_size,
|
||||
stats,
|
||||
);
|
||||
|
||||
total_physical_size += file_size;
|
||||
loaded_layers.push(Arc::new(layer));
|
||||
} else if let Some(filename) = DeltaFileName::parse_str(&fname) {
|
||||
// Create a DeltaLayer struct for each delta file.
|
||||
// The end-LSN is exclusive, while disk_consistent_lsn is
|
||||
// inclusive. For example, if disk_consistent_lsn is 100, it is
|
||||
// OK for a delta layer to have end LSN 101, but if the end LSN
|
||||
// is 102, then it might not have been fully flushed to disk
|
||||
// before crash.
|
||||
if filename.lsn_range.end > disk_consistent_lsn + 1 {
|
||||
info!(
|
||||
"found future delta layer {} on timeline {} disk_consistent_lsn is {}",
|
||||
filename, self.timeline_id, disk_consistent_lsn
|
||||
if !unrecognized_files.is_empty() {
|
||||
// assume that if there are any, there are many more of them.
|
||||
let n = unrecognized_files.len();
|
||||
let first = &unrecognized_files[..n.min(10)];
|
||||
anyhow::bail!(
|
||||
"unrecognized files in timeline dir (total {n}), first 10: {first:?}"
|
||||
);
|
||||
|
||||
rename_to_backup(&direntry_path)?;
|
||||
continue;
|
||||
}
|
||||
|
||||
let file_size = direntry_path.metadata()?.len();
|
||||
let stats =
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
|
||||
let decided =
|
||||
init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn);
|
||||
|
||||
let layer = DeltaLayer::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
&filename,
|
||||
file_size,
|
||||
stats,
|
||||
);
|
||||
let mut loaded_layers = Vec::new();
|
||||
let mut needs_upload = Vec::new();
|
||||
let mut needs_cleanup = Vec::new();
|
||||
let mut total_physical_size = 0;
|
||||
|
||||
total_physical_size += file_size;
|
||||
loaded_layers.push(Arc::new(layer));
|
||||
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
|
||||
// ignore these
|
||||
} else if remote_timeline_client::is_temp_download_file(&direntry_path) {
|
||||
info!(
|
||||
"skipping temp download file, reconcile_with_remote will resume / clean up: {}",
|
||||
fname
|
||||
);
|
||||
} else if is_ephemeral_file(&fname) {
|
||||
// Delete any old ephemeral files
|
||||
trace!("deleting old ephemeral file in timeline dir: {}", fname);
|
||||
fs::remove_file(&direntry_path)?;
|
||||
} else if is_temporary(&direntry_path) {
|
||||
info!("removing temp timeline file at {}", direntry_path.display());
|
||||
fs::remove_file(&direntry_path).with_context(|| {
|
||||
format!(
|
||||
"failed to remove temp download file at {}",
|
||||
direntry_path.display()
|
||||
)
|
||||
})?;
|
||||
} else {
|
||||
warn!("unrecognized filename in timeline dir: {}", fname);
|
||||
for (name, decision) in decided {
|
||||
let decision = match decision {
|
||||
Ok(UseRemote { local, remote }) => {
|
||||
path.push(name.file_name());
|
||||
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
|
||||
path.pop();
|
||||
|
||||
UseRemote { local, remote }
|
||||
}
|
||||
Ok(decision) => decision,
|
||||
Err(FutureLayer { local }) => {
|
||||
if local.is_some() {
|
||||
path.push(name.file_name());
|
||||
init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?;
|
||||
path.pop();
|
||||
}
|
||||
needs_cleanup.push(name);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match &name {
|
||||
Delta(d) => assert!(d.lsn_range.end <= disk_consistent_lsn + 1),
|
||||
Image(i) => assert!(i.lsn <= disk_consistent_lsn),
|
||||
}
|
||||
|
||||
let status = match &decision {
|
||||
UseLocal(_) | NeedsUpload(_) => LayerResidenceStatus::Resident,
|
||||
Evicted(_) | UseRemote { .. } => LayerResidenceStatus::Evicted,
|
||||
};
|
||||
|
||||
let stats = LayerAccessStats::for_loading_layer(status);
|
||||
|
||||
let layer: Arc<dyn PersistentLayer> = match (name, &decision) {
|
||||
(Delta(d), UseLocal(m) | NeedsUpload(m)) => {
|
||||
total_physical_size += m.file_size();
|
||||
Arc::new(DeltaLayer::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
&d,
|
||||
m.file_size(),
|
||||
stats,
|
||||
))
|
||||
}
|
||||
(Image(i), UseLocal(m) | NeedsUpload(m)) => {
|
||||
total_physical_size += m.file_size();
|
||||
Arc::new(ImageLayer::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
&i,
|
||||
m.file_size(),
|
||||
stats,
|
||||
))
|
||||
}
|
||||
(Delta(d), Evicted(remote) | UseRemote { remote, .. }) => Arc::new(
|
||||
RemoteLayer::new_delta(tenant_id, timeline_id, &d, remote, stats),
|
||||
),
|
||||
(Image(i), Evicted(remote) | UseRemote { remote, .. }) => Arc::new(
|
||||
RemoteLayer::new_img(tenant_id, timeline_id, &i, remote, stats),
|
||||
),
|
||||
};
|
||||
|
||||
if let NeedsUpload(m) = decision {
|
||||
needs_upload.push((layer.clone(), m));
|
||||
}
|
||||
|
||||
loaded_layers.push(layer);
|
||||
}
|
||||
Ok((
|
||||
loaded_layers,
|
||||
(needs_upload, needs_cleanup),
|
||||
total_physical_size,
|
||||
))
|
||||
}
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(anyhow::Error::new)
|
||||
.and_then(|x| x)?;
|
||||
|
||||
let num_layers = loaded_layers.len();
|
||||
guard.initialize_local_layers(loaded_layers, Lsn(disk_consistent_lsn.0) + 1);
|
||||
|
||||
guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
|
||||
|
||||
if let Some(rtc) = self.remote_client.as_ref() {
|
||||
let (needs_upload, needs_cleanup) = to_sync;
|
||||
for (layer, m) in needs_upload {
|
||||
rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
|
||||
}
|
||||
rtc.schedule_layer_file_deletion(&needs_cleanup)?;
|
||||
rtc.schedule_index_upload_for_file_changes()?;
|
||||
// Tenant::create_timeline will wait for these uploads to happen before returning, or
|
||||
// on retry.
|
||||
}
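        // Hedged recap (illustrative enum, not the crate's init::Decision) of how each reconcile
        // outcome above maps to residence status and follow-up remote work:
        enum ReconcileOutcome {
            UseLocal,    // local file matches the remote index: resident, nothing to schedule
            NeedsUpload, // local-only file: resident, schedule a layer upload
            Evicted,     // remote-only file: represent it with a RemoteLayer
            UseRemote,   // local file disagrees with remote metadata: drop local, use RemoteLayer
            FutureLayer, // beyond disk_consistent_lsn: clean up locally, schedule remote deletion
        }

        fn is_resident(outcome: &ReconcileOutcome) -> bool {
            matches!(outcome, ReconcileOutcome::UseLocal | ReconcileOutcome::NeedsUpload)
        }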
|
||||
|
||||
info!(
|
||||
"loaded layer map with {} layers at {}, total physical size: {}",
|
||||
@@ -1710,236 +1768,6 @@ impl Timeline {
|
||||
.set(total_physical_size);
|
||||
|
||||
timer.stop_and_record();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_remote_layers(
|
||||
&self,
|
||||
index_part: &IndexPart,
|
||||
local_layers: HashMap<LayerFileName, Arc<dyn PersistentLayer>>,
|
||||
up_to_date_disk_consistent_lsn: Lsn,
|
||||
) -> anyhow::Result<HashMap<LayerFileName, Arc<dyn PersistentLayer>>> {
|
||||
// Are we missing some files that are present in remote storage?
|
||||
// Create RemoteLayer instances for them.
|
||||
let mut local_only_layers = local_layers;
|
||||
|
||||
// We're holding a layer map lock for a while but this
|
||||
// method is only called during init so it's fine.
|
||||
let mut guard = self.layers.write().await;
|
||||
|
||||
let mut corrupted_local_layers = Vec::new();
|
||||
let mut added_remote_layers = Vec::new();
|
||||
for remote_layer_name in &index_part.timeline_layers {
|
||||
let local_layer = local_only_layers.remove(remote_layer_name);
|
||||
|
||||
let remote_layer_metadata = index_part
|
||||
.layer_metadata
|
||||
.get(remote_layer_name)
|
||||
.map(LayerFileMetadata::from)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"No remote layer metadata found for layer {}",
|
||||
remote_layer_name.file_name()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Is the local layer's size different from the size stored in the
|
||||
// remote index file?
|
||||
// If so, rename_to_backup those files & replace their local layer with
|
||||
// a RemoteLayer in the layer map so that we re-download them on-demand.
|
||||
if let Some(local_layer) = local_layer {
|
||||
let local_layer_path = local_layer
|
||||
.local_path()
|
||||
.expect("caller must ensure that local_layers only contains local layers");
|
||||
ensure!(
|
||||
local_layer_path.exists(),
|
||||
"every layer from local_layers must exist on disk: {}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
|
||||
let remote_size = remote_layer_metadata.file_size();
|
||||
let metadata = local_layer_path.metadata().with_context(|| {
|
||||
format!(
|
||||
"get file size of local layer {}",
|
||||
local_layer_path.display()
|
||||
)
|
||||
})?;
|
||||
let local_size = metadata.len();
|
||||
if local_size != remote_size {
|
||||
warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
|
||||
if let Err(err) = rename_to_backup(&local_layer_path) {
|
||||
assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
|
||||
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
|
||||
} else {
|
||||
self.metrics.resident_physical_size_gauge.sub(local_size);
|
||||
corrupted_local_layers.push(local_layer);
|
||||
// fall-through to adding the remote layer
|
||||
}
|
||||
} else {
|
||||
debug!(
|
||||
"layer is present locally and file size matches remote, using it: {}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"remote layer does not exist locally, creating remote layer: {}",
|
||||
remote_layer_name.file_name()
|
||||
);
|
||||
|
||||
match remote_layer_name {
|
||||
LayerFileName::Image(imgfilename) => {
|
||||
if imgfilename.lsn > up_to_date_disk_consistent_lsn {
|
||||
info!(
|
||||
"found future image layer {} on timeline {} remote_consistent_lsn is {}",
|
||||
imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
|
||||
);
|
||||
continue;
|
||||
}
|
||||
let stats =
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
|
||||
|
||||
let remote_layer = RemoteLayer::new_img(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
imgfilename,
|
||||
&remote_layer_metadata,
|
||||
stats,
|
||||
);
|
||||
let remote_layer = Arc::new(remote_layer);
|
||||
added_remote_layers.push(remote_layer);
|
||||
}
|
||||
LayerFileName::Delta(deltafilename) => {
|
||||
// Create a RemoteLayer for the delta file.
|
||||
// The end-LSN is exclusive, while disk_consistent_lsn is
|
||||
// inclusive. For example, if disk_consistent_lsn is 100, it is
|
||||
// OK for a delta layer to have end LSN 101, but if the end LSN
|
||||
// is 102, then it might not have been fully flushed to disk
|
||||
// before crash.
|
||||
if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
|
||||
info!(
|
||||
"found future delta layer {} on timeline {} remote_consistent_lsn is {}",
|
||||
deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
|
||||
);
|
||||
continue;
|
||||
}
|
||||
let stats =
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
|
||||
|
||||
let remote_layer = RemoteLayer::new_delta(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
deltafilename,
|
||||
&remote_layer_metadata,
|
||||
stats,
|
||||
);
|
||||
let remote_layer = Arc::new(remote_layer);
|
||||
added_remote_layers.push(remote_layer);
|
||||
}
|
||||
}
|
||||
}
|
||||
guard.initialize_remote_layers(corrupted_local_layers, added_remote_layers);
|
||||
Ok(local_only_layers)
|
||||
}
|
||||
|
||||
/// This function will synchronize local state with what we have in remote storage.
|
||||
///
|
||||
/// Steps taken:
|
||||
/// 1. Initialize upload queue based on `index_part`.
|
||||
/// 2. Create `RemoteLayer` instances for layers that exist only on the remote.
|
||||
/// The list of layers on the remote comes from `index_part`.
|
||||
/// The list of local layers is given by the layer map's `iter_historic_layers()`.
|
||||
/// So, the layer map must have been loaded already.
|
||||
/// 3. Schedule upload of local-only layer files (which will then also update the remote
|
||||
/// IndexPart to include the new layer files).
|
||||
///
|
||||
/// Refer to the [`remote_timeline_client`] module comment for more context.
|
||||
///
|
||||
/// # TODO
|
||||
/// May be a bit cleaner to do things based on populated remote client,
|
||||
/// and then do things based on its upload_queue.latest_files.
|
||||
#[instrument(skip(self, index_part, up_to_date_metadata))]
|
||||
pub async fn reconcile_with_remote(
|
||||
&self,
|
||||
up_to_date_metadata: &TimelineMetadata,
|
||||
index_part: Option<&IndexPart>,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("starting");
|
||||
let remote_client = self
|
||||
.remote_client
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("cannot download without remote storage"))?;
|
||||
|
||||
let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn();
|
||||
|
||||
let local_layers = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
layers
|
||||
.iter_historic_layers()
|
||||
.map(|l| (l.filename(), guard.get_from_desc(&l)))
|
||||
.collect::<HashMap<_, _>>()
|
||||
};
|
||||
|
||||
// If no writes happen, new branches do not have any layers, only the metadata file.
|
||||
let has_local_layers = !local_layers.is_empty();
|
||||
let local_only_layers = match index_part {
|
||||
Some(index_part) => {
|
||||
info!(
|
||||
"initializing upload queue from remote index with {} layer files",
|
||||
index_part.timeline_layers.len()
|
||||
);
|
||||
remote_client.init_upload_queue(index_part)?;
|
||||
self.create_remote_layers(index_part, local_layers, disk_consistent_lsn)
|
||||
.await?
|
||||
}
|
||||
None => {
|
||||
info!("initializing upload queue as empty");
|
||||
remote_client.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
|
||||
local_layers
|
||||
}
|
||||
};
|
||||
|
||||
if has_local_layers {
|
||||
// Are there local files that don't exist remotely? Schedule uploads for them.
|
||||
// Local timeline metadata will get uploaded to remote along with the layers.
|
||||
for (layer_name, layer) in &local_only_layers {
|
||||
// XXX solve this in the type system
|
||||
let layer_path = layer
|
||||
.local_path()
|
||||
.expect("local_only_layers only contains local layers");
|
||||
let layer_size = layer_path
|
||||
.metadata()
|
||||
.with_context(|| format!("failed to get file {layer_path:?} metadata"))?
|
||||
.len();
|
||||
info!("scheduling {layer_path:?} for upload");
|
||||
remote_client
|
||||
.schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
|
||||
}
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
} else if index_part.is_none() {
|
||||
// No data on the remote storage, no local layers, local metadata file.
|
||||
//
|
||||
// TODO https://github.com/neondatabase/neon/issues/3865
|
||||
// Currently, console does not wait for the timeline data upload to the remote storage
|
||||
// and considers the timeline created, expecting other pageserver nodes to work with it.
|
||||
// Branch metadata upload could get interrupted (e.g. pageserver got killed),
|
||||
// hence any locally existing branch metadata with no remote counterpart should be uploaded,
|
||||
// otherwise any other pageserver won't see the branch on `attach`.
|
||||
//
|
||||
// After the issue gets implemented, pageserver should rather remove the branch,
|
||||
// since absence on S3 means we did not acknowledge the branch creation and console will have to retry,
|
||||
// no need to keep the old files.
|
||||
remote_client.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
|
||||
} else {
|
||||
// Local timeline has a metadata file, remote one too, both have no layers to sync.
|
||||
}
|
||||
|
||||
info!("Done");
|
||||
|
||||
Ok(())
|
||||
}
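    // Hedged sketch of the "local-only layers" step above: anything present locally but missing
    // from the remote index gets an upload scheduled. Types here are simplified stand-ins.
    fn local_only_layers_sketch(
        local: &std::collections::HashMap<String, u64>, // layer file name -> file size
        remote: &std::collections::HashSet<String>,     // layer names listed in the remote IndexPart
    ) -> Vec<(String, u64)> {
        let mut out = Vec::new();
        for (name, size) in local {
            if !remote.contains(name) {
                out.push((name.clone(), *size));
            }
        }
        out
    }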
|
||||
|
||||
@@ -2846,7 +2674,6 @@ impl Timeline {
|
||||
if let Some(ref l) = delta_layer_to_add {
|
||||
// TODO: move access stats, metrics update, etc. into layer manager.
|
||||
l.access_stats().record_residence_event(
|
||||
&guard,
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
@@ -3143,7 +2970,6 @@ impl Timeline {
|
||||
self.tenant_id,
|
||||
&img_range,
|
||||
lsn,
|
||||
false, // image layer always covers the full range
|
||||
)?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
@@ -3236,7 +3062,6 @@ impl Timeline {
|
||||
.add(metadata.len());
|
||||
let l = Arc::new(l);
|
||||
l.access_stats().record_residence_event(
|
||||
&guard,
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
@@ -3404,8 +3229,8 @@ impl Timeline {
|
||||
/// start of level0 files compaction, the on-demand download should be revisited as well.
|
||||
///
|
||||
/// [`compact_inner`]: Self::compact_inner
|
||||
fn compact_level0_phase1(
|
||||
self: Arc<Self>,
|
||||
async fn compact_level0_phase1(
|
||||
self: &Arc<Self>,
|
||||
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
|
||||
mut stats: CompactLevel0Phase1StatsBuilder,
|
||||
@@ -3556,7 +3381,7 @@ impl Timeline {
|
||||
.collect();
|
||||
for dl in downcast_deltas.iter() {
|
||||
// TODO: replace this with an await once we fully go async
|
||||
all_keys.extend(Handle::current().block_on(DeltaLayer::load_keys(dl, ctx))?);
|
||||
all_keys.extend(DeltaLayer::load_keys(dl, ctx).await?);
|
||||
}
|
||||
|
||||
// The current stdlib sorting implementation is designed in a way where it is
|
||||
@@ -3670,107 +3495,103 @@ impl Timeline {
|
||||
let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
|
||||
let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
|
||||
|
||||
// TODO remove this block_on wrapper once we fully go async
|
||||
Handle::current().block_on(async {
|
||||
for &DeltaEntry {
|
||||
key, lsn, ref val, ..
|
||||
} in all_values_iter
|
||||
{
|
||||
let value = val.load().await?;
|
||||
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
|
||||
// We need to check key boundaries once we reach next key or end of layer with the same key
|
||||
if !same_key || lsn == dup_end_lsn {
|
||||
let mut next_key_size = 0u64;
|
||||
let is_dup_layer = dup_end_lsn.is_valid();
|
||||
dup_start_lsn = Lsn::INVALID;
|
||||
if !same_key {
|
||||
dup_end_lsn = Lsn::INVALID;
|
||||
}
|
||||
// Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
|
||||
for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
|
||||
next_key_size = next_size;
|
||||
if key != next_key {
|
||||
if dup_end_lsn.is_valid() {
|
||||
// We are writing a segment with duplicates:
|
||||
// place all remaining values of this key in separate segment
|
||||
dup_start_lsn = dup_end_lsn; // new segment starts where the old one stops
|
||||
dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
|
||||
}
|
||||
break;
|
||||
}
|
||||
key_values_total_size += next_size;
|
||||
// Check if it is time to split segment: if total keys size is larger than target file size.
|
||||
// We need to avoid generation of empty segments if next_size > target_file_size.
|
||||
if key_values_total_size > target_file_size && lsn != next_lsn {
|
||||
// Split key between multiple layers: such a layer can contain only a single key
|
||||
dup_start_lsn = if dup_end_lsn.is_valid() {
|
||||
dup_end_lsn // new segment with duplicates starts where old one stops
|
||||
} else {
|
||||
lsn // start with the first LSN for this key
|
||||
};
|
||||
dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
|
||||
break;
|
||||
}
|
||||
}
|
||||
// handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
|
||||
if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
|
||||
dup_start_lsn = dup_end_lsn;
|
||||
dup_end_lsn = lsn_range.end;
|
||||
}
|
||||
if writer.is_some() {
|
||||
let written_size = writer.as_mut().unwrap().size();
|
||||
let contains_hole =
|
||||
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
|
||||
// check if the key causes layer overflow or the range contains a hole...
|
||||
if is_dup_layer
|
||||
|| dup_end_lsn.is_valid()
|
||||
|| written_size + key_values_total_size > target_file_size
|
||||
|| contains_hole
|
||||
{
|
||||
// ... if so, flush previous layer and prepare to write new one
|
||||
new_layers.push(Arc::new(
|
||||
writer.take().unwrap().finish(prev_key.unwrap().next())?,
|
||||
));
|
||||
writer = None;
|
||||
|
||||
if contains_hole {
|
||||
// skip hole
|
||||
next_hole += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Remember size of key value because at next iteration we will access next item
|
||||
key_values_total_size = next_key_size;
|
||||
for &DeltaEntry {
|
||||
key, lsn, ref val, ..
|
||||
} in all_values_iter
|
||||
{
|
||||
let value = val.load().await?;
|
||||
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
|
||||
// We need to check key boundaries once we reach next key or end of layer with the same key
|
||||
if !same_key || lsn == dup_end_lsn {
|
||||
let mut next_key_size = 0u64;
|
||||
let is_dup_layer = dup_end_lsn.is_valid();
|
||||
dup_start_lsn = Lsn::INVALID;
|
||||
if !same_key {
|
||||
dup_end_lsn = Lsn::INVALID;
|
||||
}
|
||||
if writer.is_none() {
|
||||
// Create writer if not initialized yet
|
||||
writer = Some(DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
key,
|
||||
// Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
|
||||
for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
|
||||
next_key_size = next_size;
|
||||
if key != next_key {
|
||||
if dup_end_lsn.is_valid() {
|
||||
// this is a layer containing slice of values of the same key
|
||||
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
|
||||
dup_start_lsn..dup_end_lsn
|
||||
// We are writing a segment with duplicates:
|
||||
// place all remaining values of this key in separate segment
|
||||
dup_start_lsn = dup_end_lsn; // new segment starts where the old one stops
|
||||
dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
|
||||
}
|
||||
break;
|
||||
}
|
||||
key_values_total_size += next_size;
|
||||
// Check if it is time to split segment: if total keys size is larger than target file size.
|
||||
// We need to avoid generation of empty segments if next_size > target_file_size.
|
||||
if key_values_total_size > target_file_size && lsn != next_lsn {
|
||||
// Split key between multiple layers: such a layer can contain only a single key
|
||||
dup_start_lsn = if dup_end_lsn.is_valid() {
|
||||
dup_end_lsn // new segment with duplicates starts where old one stops
|
||||
} else {
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
lsn_range.clone()
|
||||
},
|
||||
)?);
|
||||
lsn // start with the first LSN for this key
|
||||
};
|
||||
dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
|
||||
break;
|
||||
}
|
||||
}
|
||||
// handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
|
||||
if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
|
||||
dup_start_lsn = dup_end_lsn;
|
||||
dup_end_lsn = lsn_range.end;
|
||||
}
|
||||
if writer.is_some() {
|
||||
let written_size = writer.as_mut().unwrap().size();
|
||||
let contains_hole =
|
||||
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
|
||||
// check if the key causes layer overflow or the range contains a hole...
|
||||
if is_dup_layer
|
||||
|| dup_end_lsn.is_valid()
|
||||
|| written_size + key_values_total_size > target_file_size
|
||||
|| contains_hole
|
||||
{
|
||||
// ... if so, flush previous layer and prepare to write new one
|
||||
new_layers.push(Arc::new(
|
||||
writer.take().unwrap().finish(prev_key.unwrap().next())?,
|
||||
));
|
||||
writer = None;
|
||||
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
Result::<_>::Err(anyhow::anyhow!(
|
||||
"failpoint delta-layer-writer-fail-before-finish"
|
||||
))
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||
prev_key = Some(key);
|
||||
if contains_hole {
|
||||
// skip hole
|
||||
next_hole += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Remember the size of the key value because at the next iteration we will access the next item
|
||||
key_values_total_size = next_key_size;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
if writer.is_none() {
|
||||
// Create writer if not initialized yet
|
||||
writer = Some(DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
key,
|
||||
if dup_end_lsn.is_valid() {
|
||||
// this is a layer containing slice of values of the same key
|
||||
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
|
||||
dup_start_lsn..dup_end_lsn
|
||||
} else {
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
lsn_range.clone()
|
||||
},
|
||||
)?);
|
||||
}
|
||||
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
Err(CompactionError::Other(anyhow::anyhow!(
|
||||
"failpoint delta-layer-writer-fail-before-finish"
|
||||
)))
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||
prev_key = Some(key);
|
||||
}
|
||||
if let Some(writer) = writer {
|
||||
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
|
||||
}
|
||||
@@ -3783,10 +3604,10 @@ impl Timeline {
|
||||
// we still might easily hit the limit otherwise.
|
||||
let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
|
||||
for layer in new_layers.iter() {
|
||||
if layer.desc.file_size > warn_limit {
|
||||
if layer.layer_desc().file_size > warn_limit {
|
||||
warn!(
|
||||
%layer,
|
||||
"created delta file of size {} larger than double of target of {target_file_size}", layer.desc.file_size
|
||||
"created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -3804,7 +3625,7 @@ impl Timeline {
|
||||
|
||||
stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
|
||||
stats.new_deltas_count = Some(new_layers.len());
|
||||
stats.new_deltas_size = Some(new_layers.iter().map(|l| l.desc.file_size).sum());
|
||||
stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum());
|
||||
|
||||
match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
|
||||
.and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
|
||||
@@ -3844,8 +3665,7 @@ impl Timeline {
|
||||
deltas_to_compact,
|
||||
} = {
|
||||
let phase1_span = info_span!("compact_level0_phase1");
|
||||
let myself = Arc::clone(self);
|
||||
let ctx = ctx.attached_child(); // technically, the spawn_blocking can outlive this future
|
||||
let ctx = ctx.attached_child();
|
||||
let mut stats = CompactLevel0Phase1StatsBuilder {
|
||||
version: Some(2),
|
||||
tenant_id: Some(self.tenant_id),
|
||||
@@ -3859,18 +3679,15 @@ impl Timeline {
|
||||
stats.read_lock_acquisition_micros =
|
||||
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
|
||||
let layer_removal_cs = layer_removal_cs.clone();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _entered = phase1_span.enter();
|
||||
myself.compact_level0_phase1(
|
||||
layer_removal_cs,
|
||||
phase1_layers_locked,
|
||||
stats,
|
||||
target_file_size,
|
||||
&ctx,
|
||||
)
|
||||
})
|
||||
.await
|
||||
.context("spawn_blocking")??
|
||||
self.compact_level0_phase1(
|
||||
layer_removal_cs,
|
||||
phase1_layers_locked,
|
||||
stats,
|
||||
target_file_size,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(phase1_span)
|
||||
.await?
|
||||
};
|
||||
|
||||
if new_layers.is_empty() && deltas_to_compact.is_empty() {
|
||||
@@ -3924,7 +3741,6 @@ impl Timeline {
|
||||
|
||||
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
|
||||
l.access_stats().record_residence_event(
|
||||
&guard,
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
@@ -4843,7 +4659,8 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
|
||||
for i in 0u32.. {
|
||||
new_path.set_file_name(format!("{filename}.{i}.old"));
|
||||
if !new_path.exists() {
|
||||
std::fs::rename(path, &new_path)?;
|
||||
std::fs::rename(path, &new_path)
|
||||
.with_context(|| format!("rename {path:?} to {new_path:?}"))?;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,7 +25,7 @@ use crate::{
|
||||
InitializationOrder,
|
||||
};
|
||||
|
||||
use super::Timeline;
|
||||
use super::{Timeline, TimelineResources};
|
||||
|
||||
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
||||
@@ -416,7 +416,7 @@ impl DeleteTimelineFlow {
|
||||
timeline_id,
|
||||
local_metadata,
|
||||
None, // Ancestor is not needed for deletion.
|
||||
remote_client,
|
||||
TimelineResources { remote_client },
|
||||
init_order,
|
||||
// Important. We don't pass ancestor above because it can be missing.
|
||||
// Thus we need to skip the validation here.
|
||||
|
||||
pageserver/src/tenant/timeline/init.rs (new file, 199 lines)
@@ -0,0 +1,199 @@
|
||||
use crate::{
|
||||
is_temporary,
|
||||
tenant::{
|
||||
ephemeral_file::is_ephemeral_file,
|
||||
remote_timeline_client::{
|
||||
self,
|
||||
index::{IndexPart, LayerFileMetadata},
|
||||
},
|
||||
storage_layer::LayerFileName,
|
||||
},
|
||||
METADATA_FILE_NAME,
|
||||
};
|
||||
use anyhow::Context;
|
||||
use std::{collections::HashMap, ffi::OsString, path::Path, str::FromStr};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Identified files in the timeline directory.
|
||||
pub(super) enum Discovered {
|
||||
/// The only one we care about
|
||||
Layer(LayerFileName, u64),
|
||||
/// Old ephemeral files from previous launches, should be removed
|
||||
Ephemeral(OsString),
|
||||
/// Old temporary timeline files, unsure what these really are, should be removed
|
||||
Temporary(OsString),
|
||||
/// Temporary on-demand download files, should be removed
|
||||
TemporaryDownload(OsString),
|
||||
/// "metadata" file we persist locally and include in `index_part.json`
|
||||
Metadata,
|
||||
/// Backup file from previously-detected future layers
|
||||
IgnoredBackup,
|
||||
/// Unrecognized, warn about these
|
||||
Unknown(OsString),
|
||||
}
|
||||
|
||||
/// Scans the timeline directory for interesting files.
|
||||
pub(super) fn scan_timeline_dir(path: &Path) -> anyhow::Result<Vec<Discovered>> {
|
||||
let mut ret = Vec::new();
|
||||
|
||||
for direntry in std::fs::read_dir(path)? {
|
||||
let direntry = direntry?;
|
||||
let direntry_path = direntry.path();
|
||||
let file_name = direntry.file_name();
|
||||
|
||||
let fname = file_name.to_string_lossy();
|
||||
|
||||
let discovered = match LayerFileName::from_str(&fname) {
|
||||
Ok(file_name) => {
|
||||
let file_size = direntry.metadata()?.len();
|
||||
Discovered::Layer(file_name, file_size)
|
||||
}
|
||||
Err(_) => {
|
||||
if fname == METADATA_FILE_NAME {
|
||||
Discovered::Metadata
|
||||
} else if fname.ends_with(".old") {
|
||||
// ignore these
|
||||
Discovered::IgnoredBackup
|
||||
} else if remote_timeline_client::is_temp_download_file(&direntry_path) {
|
||||
Discovered::TemporaryDownload(file_name)
|
||||
} else if is_ephemeral_file(&fname) {
|
||||
Discovered::Ephemeral(file_name)
|
||||
} else if is_temporary(&direntry_path) {
|
||||
Discovered::Temporary(file_name)
|
||||
} else {
|
||||
Discovered::Unknown(file_name)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
ret.push(discovered);
|
||||
}
|
||||
|
||||
Ok(ret)
|
||||
}
|
||||
|
||||
/// Decision on what to do with a layer file after considering its local and remote metadata.
|
||||
#[derive(Clone)]
|
||||
pub(super) enum Decision {
|
||||
/// The layer is not present locally.
|
||||
Evicted(LayerFileMetadata),
|
||||
/// The layer is present locally, but local metadata does not match remote; we must
|
||||
/// delete it and treat it as evicted.
|
||||
UseRemote {
|
||||
local: LayerFileMetadata,
|
||||
remote: LayerFileMetadata,
|
||||
},
|
||||
/// The layer is present locally, and metadata matches.
|
||||
UseLocal(LayerFileMetadata),
|
||||
/// The layer is only known locally, it needs to be uploaded.
|
||||
NeedsUpload(LayerFileMetadata),
|
||||
}
|
||||
|
||||
/// The related layer is in the future compared to disk_consistent_lsn; it must not be loaded.
|
||||
#[derive(Debug)]
|
||||
pub(super) struct FutureLayer {
|
||||
/// The local metadata. `None` if the layer is only known through [`IndexPart`].
|
||||
pub(super) local: Option<LayerFileMetadata>,
|
||||
}
|
||||
|
||||
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
|
||||
///
|
||||
/// This function should not gain additional reasons to fail beyond [`FutureLayer`]; consider adding
|
||||
/// the checks earlier to [`scan_timeline_dir`].
|
||||
pub(super) fn reconcile(
|
||||
discovered: Vec<(LayerFileName, u64)>,
|
||||
index_part: Option<&IndexPart>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
) -> Vec<(LayerFileName, Result<Decision, FutureLayer>)> {
|
||||
use Decision::*;
|
||||
|
||||
// name => (local, remote)
|
||||
type Collected = HashMap<LayerFileName, (Option<LayerFileMetadata>, Option<LayerFileMetadata>)>;
|
||||
|
||||
let mut discovered = discovered
|
||||
.into_iter()
|
||||
.map(|(name, file_size)| (name, (Some(LayerFileMetadata::new(file_size)), None)))
|
||||
.collect::<Collected>();
|
||||
|
||||
// merge any index_part information, when available
|
||||
index_part
|
||||
.as_ref()
|
||||
.map(|ip| ip.layer_metadata.iter())
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
|
||||
.for_each(|(name, metadata)| {
|
||||
if let Some(existing) = discovered.get_mut(name) {
|
||||
existing.1 = Some(metadata);
|
||||
} else {
|
||||
discovered.insert(name.to_owned(), (None, Some(metadata)));
|
||||
}
|
||||
});
|
||||
|
||||
discovered
|
||||
.into_iter()
|
||||
.map(|(name, (local, remote))| {
|
||||
let decision = if name.is_in_future(disk_consistent_lsn) {
|
||||
Err(FutureLayer { local })
|
||||
} else {
|
||||
Ok(match (local, remote) {
|
||||
(Some(local), Some(remote)) if local != remote => UseRemote { local, remote },
|
||||
(Some(x), Some(_)) => UseLocal(x),
|
||||
(None, Some(x)) => Evicted(x),
|
||||
(Some(x), None) => NeedsUpload(x),
|
||||
(None, None) => {
|
||||
unreachable!("there must not be any non-local non-remote files")
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
(name, decision)
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
}
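Note: the match at the end of reconcile() is the whole decision table. Below is a minimal, self-contained sketch of the same rule, using simplified stand-in types; the u64 file-size payload and the decide() helper are illustrative assumptions, not the real LayerFileMetadata/LayerFileName API.

// Simplified stand-ins for illustration only; the real types carry more metadata.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct Meta { file_size: u64 }

#[derive(Debug, PartialEq)]
enum Decision {
    Evicted(Meta),
    UseRemote { local: Meta, remote: Meta },
    UseLocal(Meta),
    NeedsUpload(Meta),
}

/// Same match as `reconcile`: compare what was found on disk with what the
/// remote index claims and decide what to do with the layer file.
fn decide(local: Option<Meta>, remote: Option<Meta>) -> Decision {
    match (local, remote) {
        (Some(l), Some(r)) if l != r => Decision::UseRemote { local: l, remote: r },
        (Some(l), Some(_)) => Decision::UseLocal(l),
        (None, Some(r)) => Decision::Evicted(r),
        (Some(l), None) => Decision::NeedsUpload(l),
        (None, None) => unreachable!("there must not be any non-local non-remote files"),
    }
}

fn main() {
    // Local copy exists but disagrees with the remote index: trust the remote.
    assert_eq!(
        decide(Some(Meta { file_size: 10 }), Some(Meta { file_size: 20 })),
        Decision::UseRemote { local: Meta { file_size: 10 }, remote: Meta { file_size: 20 } }
    );
    // Known only locally: it still has to be uploaded.
    assert_eq!(
        decide(Some(Meta { file_size: 10 }), None),
        Decision::NeedsUpload(Meta { file_size: 10 })
    );
}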
|
||||
|
||||
pub(super) fn cleanup(path: &Path, kind: &str) -> anyhow::Result<()> {
|
||||
let file_name = path.file_name().expect("must be file path");
|
||||
tracing::debug!(kind, ?file_name, "cleaning up");
|
||||
std::fs::remove_file(path)
|
||||
.with_context(|| format!("failed to remove {kind} at {}", path.display()))
|
||||
}
|
||||
|
||||
pub(super) fn cleanup_local_file_for_remote(
|
||||
path: &Path,
|
||||
local: &LayerFileMetadata,
|
||||
remote: &LayerFileMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
let local_size = local.file_size();
|
||||
let remote_size = remote.file_size();
|
||||
|
||||
let file_name = path.file_name().expect("must be file path");
|
||||
tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
|
||||
if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
|
||||
assert!(
|
||||
path.exists(),
|
||||
"we would leave the local_layer without a file if this does not hold: {}",
|
||||
path.display()
|
||||
);
|
||||
Err(err)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn cleanup_future_layer(
|
||||
path: &Path,
|
||||
name: &LayerFileName,
|
||||
disk_consistent_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
use LayerFileName::*;
|
||||
let kind = match name {
|
||||
Delta(_) => "delta",
|
||||
Image(_) => "image",
|
||||
};
|
||||
// Future image layers may always be produced for LSNs that have not yet been
// flushed to disk and are still stored in InMemoryLayer.
|
||||
tracing::info!("found future {kind} layer {name} disk_consistent_lsn is {disk_consistent_lsn}");
|
||||
crate::tenant::timeline::rename_to_backup(path)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -12,38 +12,38 @@ use crate::{
|
||||
tenant::{
|
||||
layer_map::{BatchedUpdates, LayerMap},
|
||||
storage_layer::{
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, Layer, PersistentLayer,
|
||||
PersistentLayerDesc, PersistentLayerKey, RemoteLayer,
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, PersistentLayer,
|
||||
PersistentLayerDesc, PersistentLayerKey,
|
||||
},
|
||||
timeline::compare_arced_layers,
|
||||
},
|
||||
};
|
||||
|
||||
/// Provides semantic APIs to manipulate the layer map.
|
||||
pub struct LayerManager {
|
||||
pub(crate) struct LayerManager {
|
||||
layer_map: LayerMap,
|
||||
layer_fmgr: LayerFileManager,
|
||||
}
|
||||
|
||||
/// After GC, the layer map changes will not be applied immediately. Users should manually apply the changes after
|
||||
/// scheduling deletes in remote client.
|
||||
pub struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
|
||||
pub(crate) struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
|
||||
|
||||
impl ApplyGcResultGuard<'_> {
|
||||
pub fn flush(self) {
|
||||
pub(crate) fn flush(self) {
|
||||
self.0.flush();
|
||||
}
|
||||
}
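Note: as the doc comment above says, GC results are applied lazily; the guard stages the layer-map changes and only applies them when the caller flushes, after the corresponding remote deletions have been scheduled. A minimal, self-contained sketch of that pattern follows, with a Vec<String> standing in for the layer map; every name except flush() is an illustrative assumption.

struct BatchedUpdates<'a> {
    target: &'a mut Vec<String>, // stand-in for the layer map
    staged: Vec<String>,         // removals staged but not yet applied
}

impl<'a> BatchedUpdates<'a> {
    fn remove(&mut self, layer: &str) {
        self.staged.push(layer.to_owned());
    }
    fn flush(mut self) {
        // apply the staged removals only now
        for layer in std::mem::take(&mut self.staged) {
            self.target.retain(|l| l != &layer);
        }
    }
}

struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);

impl<'a> ApplyGcResultGuard<'a> {
    fn flush(self) {
        self.0.flush();
    }
}

fn main() {
    let mut layer_map = vec!["a".to_owned(), "b".to_owned()];
    let mut updates = BatchedUpdates { target: &mut layer_map, staged: Vec::new() };
    updates.remove("a");
    let guard = ApplyGcResultGuard(updates);
    // ... schedule deletion of "a" in the remote client here ...
    guard.flush(); // only now does the layer map actually change
    assert_eq!(layer_map, vec!["b".to_owned()]);
}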
|
||||
|
||||
impl LayerManager {
|
||||
pub fn create() -> Self {
|
||||
pub(crate) fn create() -> Self {
|
||||
Self {
|
||||
layer_map: LayerMap::default(),
|
||||
layer_fmgr: LayerFileManager::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
|
||||
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
|
||||
self.layer_fmgr.get_from_desc(desc)
|
||||
}
|
||||
|
||||
@@ -51,18 +51,12 @@ impl LayerManager {
|
||||
///
|
||||
/// We expect users only to be able to get an immutable layer map. If users want to make modifications,
|
||||
/// they should use the below semantic APIs. This design makes us step closer to immutable storage state.
|
||||
pub fn layer_map(&self) -> &LayerMap {
|
||||
pub(crate) fn layer_map(&self) -> &LayerMap {
|
||||
&self.layer_map
|
||||
}
|
||||
|
||||
/// Get a mutable reference to the layer map. This function will be removed once `flush_frozen_layer`
|
||||
/// gets a refactor.
|
||||
pub fn layer_map_mut(&mut self) -> &mut LayerMap {
|
||||
&mut self.layer_map
|
||||
}
|
||||
|
||||
/// Replace layers in the layer file manager, used in evictions and layer downloads.
|
||||
pub fn replace_and_verify(
|
||||
pub(crate) fn replace_and_verify(
|
||||
&mut self,
|
||||
expected: Arc<dyn PersistentLayer>,
|
||||
new: Arc<dyn PersistentLayer>,
|
||||
@@ -73,7 +67,7 @@ impl LayerManager {
|
||||
/// Called from `load_layer_map`. Initialize the layer manager with:
|
||||
/// 1. all on-disk layers
|
||||
/// 2. next open layer (with the disk_consistent_lsn LSN)
|
||||
pub fn initialize_local_layers(
|
||||
pub(crate) fn initialize_local_layers(
|
||||
&mut self,
|
||||
on_disk_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||
next_open_layer_at: Lsn,
|
||||
@@ -87,28 +81,13 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Initialize when creating a new timeline, called in `init_empty_layer_map`.
|
||||
pub fn initialize_empty(&mut self, next_open_layer_at: Lsn) {
|
||||
pub(crate) fn initialize_empty(&mut self, next_open_layer_at: Lsn) {
|
||||
self.layer_map.next_open_layer_at = Some(next_open_layer_at);
|
||||
}
|
||||
|
||||
pub fn initialize_remote_layers(
|
||||
&mut self,
|
||||
corrupted_local_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||
remote_layers: Vec<Arc<RemoteLayer>>,
|
||||
) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for layer in corrupted_local_layers {
|
||||
Self::remove_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
for layer in remote_layers {
|
||||
Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
updates.flush();
|
||||
}
|
||||
|
||||
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
|
||||
/// called within `get_layer_for_write`.
|
||||
pub fn get_layer_for_write(
|
||||
pub(crate) fn get_layer_for_write(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
last_record_lsn: Lsn,
|
||||
@@ -163,7 +142,7 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
|
||||
pub async fn try_freeze_in_memory_layer(
|
||||
pub(crate) async fn try_freeze_in_memory_layer(
|
||||
&mut self,
|
||||
Lsn(last_record_lsn): Lsn,
|
||||
last_freeze_at: &AtomicLsn,
|
||||
@@ -185,7 +164,7 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Add image layers to the layer map, called from `create_image_layers`.
|
||||
pub fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
|
||||
pub(crate) fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for layer in image_layers {
|
||||
Self::insert_historic_layer(Arc::new(layer), &mut updates, &mut self.layer_fmgr);
|
||||
@@ -194,7 +173,7 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Flush a frozen layer and add the written delta layer to the layer map.
|
||||
pub fn finish_flush_l0_layer(
|
||||
pub(crate) fn finish_flush_l0_layer(
|
||||
&mut self,
|
||||
delta_layer: Option<DeltaLayer>,
|
||||
frozen_layer_for_check: &Arc<InMemoryLayer>,
|
||||
@@ -214,7 +193,7 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Called when compaction is completed.
|
||||
pub fn finish_compact_l0(
|
||||
pub(crate) fn finish_compact_l0(
|
||||
&mut self,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
compact_from: Vec<Arc<dyn PersistentLayer>>,
|
||||
@@ -242,7 +221,7 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
|
||||
pub fn finish_gc_timeline(
|
||||
pub(crate) fn finish_gc_timeline(
|
||||
&mut self,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
gc_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||
@@ -271,16 +250,6 @@ impl LayerManager {
|
||||
mapping.insert(layer);
|
||||
}
|
||||
|
||||
/// Helper function to remove a layer from the layer map and file manager
|
||||
fn remove_historic_layer(
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
updates: &mut BatchedUpdates<'_>,
|
||||
mapping: &mut LayerFileManager,
|
||||
) {
|
||||
updates.remove_historic(layer.layer_desc());
|
||||
mapping.remove(layer);
|
||||
}
|
||||
|
||||
/// Removes the layer from local FS (if present) and from memory.
|
||||
/// Remote storage is not affected by this operation.
|
||||
fn delete_historic_layer(
|
||||
@@ -313,7 +282,7 @@ impl LayerManager {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
|
||||
pub(crate) struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
|
||||
HashMap<PersistentLayerKey, Arc<T>>,
|
||||
);
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ use crate::metrics::{
|
||||
WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
|
||||
WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
|
||||
};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::task_mgr::{shutdown_token, TaskKind};
|
||||
use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
@@ -211,11 +211,14 @@ async fn subscribe_for_timeline_updates(
|
||||
id: TenantTimelineId,
|
||||
) -> Streaming<SafekeeperTimelineInfo> {
|
||||
let mut attempt = 0;
|
||||
let cancel = shutdown_token();
|
||||
|
||||
loop {
|
||||
exponential_backoff(
|
||||
attempt,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
&cancel,
|
||||
)
|
||||
.await;
|
||||
attempt += 1;
|
||||
|
||||
@@ -140,36 +140,24 @@ impl UploadQueue {
|
||||
}
|
||||
}
|
||||
|
||||
let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
|
||||
for layer_name in &index_part.timeline_layers {
|
||||
match index_part
|
||||
.layer_metadata
|
||||
.get(layer_name)
|
||||
.map(LayerFileMetadata::from)
|
||||
{
|
||||
Some(layer_metadata) => {
|
||||
files.insert(layer_name.to_owned(), layer_metadata);
|
||||
}
|
||||
None => {
|
||||
anyhow::bail!(
|
||||
"No remote layer metadata found for layer {}",
|
||||
layer_name.file_name()
|
||||
);
|
||||
}
|
||||
}
|
||||
let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
|
||||
for (layer_name, layer_metadata) in &index_part.layer_metadata {
|
||||
files.insert(
|
||||
layer_name.to_owned(),
|
||||
LayerFileMetadata::from(layer_metadata),
|
||||
);
|
||||
}
|
||||
|
||||
let index_part_metadata = index_part.parse_metadata()?;
|
||||
info!(
|
||||
"initializing upload queue with remote index_part.disk_consistent_lsn: {}",
|
||||
index_part_metadata.disk_consistent_lsn()
|
||||
index_part.metadata.disk_consistent_lsn()
|
||||
);
|
||||
|
||||
let state = UploadQueueInitialized {
|
||||
latest_files: files,
|
||||
latest_files_changes_since_metadata_upload_scheduled: 0,
|
||||
latest_metadata: index_part_metadata.clone(),
|
||||
last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
|
||||
latest_metadata: index_part.metadata.clone(),
|
||||
last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
|
||||
// what follows are boring default initializations
|
||||
task_counter: 0,
|
||||
num_inprogress_layer_uploads: 0,
|
||||
|
||||
@@ -163,7 +163,6 @@ static void nwp_register_gucs(void);
|
||||
static void nwp_prepare_shmem(void);
|
||||
static uint64 backpressure_lag_impl(void);
|
||||
static bool backpressure_throttling_impl(void);
|
||||
static uint64 measure_replication_lag(void);
|
||||
|
||||
static process_interrupts_callback_t PrevProcessInterruptsCallback;
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook_type;
|
||||
@@ -172,8 +171,6 @@ static shmem_request_hook_type prev_shmem_request_hook = NULL;
|
||||
static void walproposer_shmem_request(void);
|
||||
#endif
|
||||
|
||||
static bool check_replication_lag;
|
||||
|
||||
void
|
||||
pg_init_walproposer(void)
|
||||
{
|
||||
@@ -2495,45 +2492,37 @@ backpressure_lag_impl(void)
|
||||
{
|
||||
if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0)
|
||||
{
|
||||
check_replication_lag = true;
|
||||
return measure_replication_lag();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static uint64
|
||||
measure_replication_lag(void)
|
||||
{
|
||||
XLogRecPtr writePtr;
|
||||
XLogRecPtr flushPtr;
|
||||
XLogRecPtr applyPtr;
|
||||
XLogRecPtr writePtr;
|
||||
XLogRecPtr flushPtr;
|
||||
XLogRecPtr applyPtr;
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL);
|
||||
XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL);
|
||||
#else
|
||||
XLogRecPtr myFlushLsn = GetFlushRecPtr();
|
||||
XLogRecPtr myFlushLsn = GetFlushRecPtr();
|
||||
#endif
|
||||
replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
|
||||
replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
|
||||
#define MB ((XLogRecPtr)1024 * 1024)
|
||||
|
||||
elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X",
|
||||
LSN_FORMAT_ARGS(myFlushLsn),
|
||||
LSN_FORMAT_ARGS(writePtr),
|
||||
LSN_FORMAT_ARGS(flushPtr),
|
||||
LSN_FORMAT_ARGS(applyPtr));
|
||||
elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X",
|
||||
LSN_FORMAT_ARGS(myFlushLsn),
|
||||
LSN_FORMAT_ARGS(writePtr),
|
||||
LSN_FORMAT_ARGS(flushPtr),
|
||||
LSN_FORMAT_ARGS(applyPtr));
|
||||
|
||||
if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0 && myFlushLsn > writePtr + max_replication_write_lag * MB))
|
||||
{
|
||||
return (myFlushLsn - writePtr - max_replication_write_lag * MB);
|
||||
}
|
||||
if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0 && myFlushLsn > writePtr + max_replication_write_lag * MB))
|
||||
{
|
||||
return (myFlushLsn - writePtr - max_replication_write_lag * MB);
|
||||
}
|
||||
|
||||
if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag * MB))
|
||||
{
|
||||
return (myFlushLsn - flushPtr - max_replication_flush_lag * MB);
|
||||
}
|
||||
if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag * MB))
|
||||
{
|
||||
return (myFlushLsn - flushPtr - max_replication_flush_lag * MB);
|
||||
}
|
||||
|
||||
if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0 && myFlushLsn > applyPtr + max_replication_apply_lag * MB))
|
||||
{
|
||||
return (myFlushLsn - applyPtr - max_replication_apply_lag * MB);
|
||||
if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0 && myFlushLsn > applyPtr + max_replication_apply_lag * MB))
|
||||
{
|
||||
return (myFlushLsn - applyPtr - max_replication_apply_lag * MB);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
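Note: the lag computation above returns, in bytes, how far the local flush LSN overshoots the slowest tracked feedback pointer beyond its configured limit, or 0 when everything is within limits. A hedged restatement of the same arithmetic in Rust; the function name, the Option-for-invalid-pointer encoding, and the test values are illustrative assumptions.

const MB: u64 = 1024 * 1024;

/// Mirrors measure_replication_lag(): each limit is only enforced when it is
/// positive and the corresponding feedback pointer is known; the first
/// exceeded limit determines the returned excess.
fn replication_lag(
    my_flush_lsn: u64,
    write_ptr: Option<u64>,
    flush_ptr: Option<u64>,
    apply_ptr: Option<u64>,
    max_write_lag_mb: u64,
    max_flush_lag_mb: u64,
    max_apply_lag_mb: u64,
) -> u64 {
    let checks = [
        (write_ptr, max_write_lag_mb),
        (flush_ptr, max_flush_lag_mb),
        (apply_ptr, max_apply_lag_mb),
    ];
    for (ptr, limit_mb) in checks {
        if let Some(ptr) = ptr {
            let limit = limit_mb * MB;
            if limit_mb > 0 && my_flush_lsn > ptr + limit {
                return my_flush_lsn - ptr - limit;
            }
        }
    }
    0
}

fn main() {
    // Replica applied up to 10 MB, we flushed 32 MB locally, allowed apply lag is 15 MB:
    // the 7 MB excess is returned and writers get throttled until replicas catch up.
    let lag = replication_lag(32 * MB, None, None, Some(10 * MB), 0, 0, 15);
    assert_eq!(lag, 7 * MB);
}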
|
||||
@@ -2550,18 +2539,14 @@ backpressure_throttling_impl(void)
|
||||
? PrevProcessInterruptsCallback()
|
||||
: false;
|
||||
|
||||
/* Throttle only backends writing WAL. */
|
||||
if (!check_replication_lag)
|
||||
/* Don't throttle read only transactions and wal sender. */
|
||||
if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny()))
|
||||
return retry;
|
||||
|
||||
/* Calculate replication lag */
|
||||
lag = measure_replication_lag();
|
||||
/* Calculate replica lag */
|
||||
lag = backpressure_lag_impl();
|
||||
if (lag == 0)
|
||||
{
|
||||
/* Do not measure replication lag before we have written something to the WAL */
|
||||
check_replication_lag = false;
|
||||
return retry;
|
||||
}
|
||||
|
||||
/* Suspend writers until replicas catch up */
|
||||
set_ps_display("backpressure throttling");
|
||||
|
||||
@@ -64,13 +64,13 @@ pub struct EndpointConnPool {
|
||||
total_conns: usize,
|
||||
}
|
||||
|
||||
/// This is cheap and not hugely secure.
|
||||
/// But probably good enough for in memory only hashes.
|
||||
/// 4096 is the number of rounds that SCRAM-SHA-256 recommends.
|
||||
/// It's not the 600,000 that OWASP recommends... but our passwords are high entropy anyway.
|
||||
///
|
||||
/// Still takes 3.5ms to hash on my hardware.
|
||||
/// Still takes 1.4ms to hash on my hardware.
|
||||
/// We don't want to ruin the latency improvements of using the pool by making password verification take too long
|
||||
const PARAMS: Params = Params {
|
||||
rounds: 10_000,
|
||||
rounds: 4096,
|
||||
output_length: 32,
|
||||
};
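Note: the quoted timings are consistent with hash cost scaling roughly linearly in the round count; a quick back-of-the-envelope check (the 3.5 ms baseline is the figure from the old comment, actual latency is hardware-dependent):

fn main() {
    // Hash latency is roughly proportional to the number of PBKDF2-style rounds.
    let ms_at_10k_rounds = 3.5_f64; // figure quoted for 10_000 rounds
    let predicted_ms_at_4096 = ms_at_10k_rounds * 4096.0 / 10_000.0;
    // ~1.43 ms, matching the "still takes 1.4 ms" note above.
    println!("predicted: {predicted_ms_at_4096:.2} ms");
}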
|
||||
|
||||
@@ -99,6 +99,10 @@ pub struct GlobalConnPool {
|
||||
max_conns_per_endpoint: usize,
|
||||
|
||||
proxy_config: &'static crate::config::ProxyConfig,
|
||||
|
||||
// Using a lock to remove any race conditions.
|
||||
// Eg cleaning up connections while a new connection is returned
|
||||
closed: RwLock<bool>,
|
||||
}
|
||||
|
||||
impl GlobalConnPool {
|
||||
@@ -108,9 +112,24 @@ impl GlobalConnPool {
|
||||
global_pool_size: AtomicUsize::new(0),
|
||||
max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
|
||||
proxy_config: config,
|
||||
closed: RwLock::new(false),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn shutdown(&self) {
|
||||
*self.closed.write() = true;
|
||||
|
||||
self.global_pool.retain(|_, endpoint_pool| {
|
||||
let mut pool = endpoint_pool.write();
|
||||
// by clearing this hashmap, we remove the slots that a connection can be returned to.
|
||||
// when returning, it drops the connection if the slot doesn't exist
|
||||
pool.pools.clear();
|
||||
pool.total_conns = 0;
|
||||
|
||||
false
|
||||
});
|
||||
}
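Note: shutdown() and put() coordinate through the closed flag: put() holds the read lock across the whole return path, so once shutdown() has taken the write lock and cleared the pools, no connection can slip back in. A minimal sketch of that race-avoidance pattern, with std::sync::RwLock standing in for parking_lot and simplified illustrative types:

use std::sync::RwLock;

struct Pool {
    closed: RwLock<bool>,
    conns: RwLock<Vec<String>>, // stand-in for the per-endpoint pools
}

impl Pool {
    fn put(&self, conn: String) {
        // hold the read lock for the whole return path so shutdown() cannot
        // interleave with a return
        let closed = self.closed.read().unwrap();
        if *closed {
            // pool already shut down: drop the connection instead of caching it
            return;
        }
        self.conns.write().unwrap().push(conn);
    }

    fn shutdown(&self) {
        // taking the write lock waits for any in-flight put() to finish
        *self.closed.write().unwrap() = true;
        self.conns.write().unwrap().clear();
    }
}

fn main() {
    let pool = Pool { closed: RwLock::new(false), conns: RwLock::new(Vec::new()) };
    pool.put("conn-1".into());
    pool.shutdown();
    pool.put("conn-2".into()); // silently discarded
    assert!(pool.conns.read().unwrap().is_empty());
}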
|
||||
|
||||
pub async fn get(
|
||||
&self,
|
||||
conn_info: &ConnInfo,
|
||||
@@ -208,7 +227,20 @@ impl GlobalConnPool {
|
||||
new_client
|
||||
}
|
||||
|
||||
pub async fn put(&self, conn_info: &ConnInfo, client: Client) -> anyhow::Result<()> {
|
||||
pub fn put(&self, conn_info: &ConnInfo, client: Client) -> anyhow::Result<()> {
|
||||
// We want to hold this open while we return. This ensures that the pool can't close
|
||||
// while we are in the middle of returning the connection.
|
||||
let closed = self.closed.read();
|
||||
if *closed {
|
||||
info!("pool: throwing away connection '{conn_info}' because pool is closed");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if client.inner.is_closed() {
|
||||
info!("pool: throwing away connection '{conn_info}' because connection is closed");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
|
||||
|
||||
// return connection to the pool
|
||||
@@ -376,9 +408,9 @@ async fn connect_to_compute_once(
|
||||
let (tx, mut rx) = tokio::sync::watch::channel(session);
|
||||
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
let span = info_span!(parent: None, "connection", %conn_info, %conn_id);
|
||||
let span = info_span!(parent: None, "connection", %conn_id);
|
||||
span.in_scope(|| {
|
||||
info!(%session, "new connection");
|
||||
info!(%conn_info, %session, "new connection");
|
||||
});
|
||||
|
||||
tokio::spawn(
|
||||
@@ -388,26 +420,28 @@ async fn connect_to_compute_once(
|
||||
info!(%session, "changed session");
|
||||
}
|
||||
|
||||
let message = ready!(connection.poll_message(cx));
|
||||
loop {
|
||||
let message = ready!(connection.poll_message(cx));
|
||||
|
||||
match message {
|
||||
Some(Ok(AsyncMessage::Notice(notice))) => {
|
||||
info!(%session, "notice: {}", notice);
|
||||
Poll::Pending
|
||||
match message {
|
||||
Some(Ok(AsyncMessage::Notice(notice))) => {
|
||||
info!(%session, "notice: {}", notice);
|
||||
}
|
||||
Some(Ok(AsyncMessage::Notification(notif))) => {
|
||||
warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
|
||||
}
|
||||
Some(Ok(_)) => {
|
||||
warn!(%session, "unknown message");
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
error!(%session, "connection error: {}", e);
|
||||
return Poll::Ready(())
|
||||
}
|
||||
None => {
|
||||
info!("connection closed");
|
||||
return Poll::Ready(())
|
||||
}
|
||||
}
|
||||
Some(Ok(AsyncMessage::Notification(notif))) => {
|
||||
warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
|
||||
Poll::Pending
|
||||
}
|
||||
Some(Ok(_)) => {
|
||||
warn!(%session, "unknown message");
|
||||
Poll::Pending
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
error!(%session, "connection error: {}", e);
|
||||
Poll::Ready(())
|
||||
}
|
||||
None => Poll::Ready(()),
|
||||
}
|
||||
})
|
||||
.instrument(span)
|
||||
|
||||
@@ -16,7 +16,6 @@ use tokio_postgres::types::Type;
|
||||
use tokio_postgres::GenericClient;
|
||||
use tokio_postgres::IsolationLevel;
|
||||
use tokio_postgres::Row;
|
||||
use tracing::Instrument;
|
||||
use url::Url;
|
||||
|
||||
use super::conn_pool::ConnInfo;
|
||||
@@ -286,13 +285,12 @@ pub async fn handle(
|
||||
};
|
||||
|
||||
if allow_pool {
|
||||
let current_span = tracing::Span::current();
|
||||
// return connection to the pool
|
||||
tokio::task::spawn(
|
||||
async move {
|
||||
let _ = conn_pool.put(&conn_info, client).await;
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _span = current_span.enter();
|
||||
let _ = conn_pool.put(&conn_info, client);
|
||||
});
|
||||
}
|
||||
|
||||
result
|
||||
|
||||
@@ -269,6 +269,18 @@ pub async fn task_main(
|
||||
|
||||
let conn_pool: Arc<GlobalConnPool> = GlobalConnPool::new(config);
|
||||
|
||||
// shutdown the connection pool
|
||||
tokio::spawn({
|
||||
let cancellation_token = cancellation_token.clone();
|
||||
let conn_pool = conn_pool.clone();
|
||||
async move {
|
||||
cancellation_token.cancelled().await;
|
||||
tokio::task::spawn_blocking(move || conn_pool.shutdown())
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
});
|
||||
|
||||
let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config());
|
||||
let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config {
|
||||
Some(config) => config.into(),
|
||||
|
||||
@@ -15,8 +15,10 @@ use tokio::fs::File;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use utils::http::endpoint::request_span;
|
||||
|
||||
use crate::receive_wal::WalReceiverState;
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::safekeeper::Term;
|
||||
use crate::send_wal::WalSenderState;
|
||||
use crate::{debug_dump, pull_timeline};
|
||||
|
||||
use crate::timelines_global_map::TimelineDeleteForceResult;
|
||||
@@ -99,6 +101,8 @@ pub struct TimelineStatus {
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
pub walsenders: Vec<WalSenderState>,
|
||||
pub walreceivers: Vec<WalReceiverState>,
|
||||
}
|
||||
|
||||
fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Result<(), ApiError> {
|
||||
@@ -149,6 +153,8 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
backup_lsn: inmem.backup_lsn,
|
||||
peer_horizon_lsn: inmem.peer_horizon_lsn,
|
||||
remote_consistent_lsn: tli.get_walsenders().get_remote_consistent_lsn(),
|
||||
walsenders: tli.get_walsenders().get_all(),
|
||||
walreceivers: tli.get_walreceivers().get_all(),
|
||||
};
|
||||
json_response(StatusCode::OK, status)
|
||||
}
|
||||
|
||||
@@ -11,11 +11,16 @@ use crate::wal_service::ConnectionId;
|
||||
use crate::GlobalTimelines;
|
||||
use anyhow::{anyhow, Context};
|
||||
use bytes::BytesMut;
|
||||
use parking_lot::MappedMutexGuard;
|
||||
use parking_lot::Mutex;
|
||||
use parking_lot::MutexGuard;
|
||||
use postgres_backend::CopyStreamHandlerEnd;
|
||||
use postgres_backend::PostgresBackend;
|
||||
use postgres_backend::PostgresBackendReader;
|
||||
use postgres_backend::QueryError;
|
||||
use pq_proto::BeMessage;
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use tokio::io::AsyncRead;
|
||||
@@ -32,6 +37,105 @@ use tracing::*;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Registry of WalReceivers (compute connections). Timeline holds it (wrapped
|
||||
/// in Arc).
|
||||
pub struct WalReceivers {
|
||||
mutex: Mutex<WalReceiversShared>,
|
||||
}
|
||||
|
||||
/// Id under which walreceiver is registered in shmem.
|
||||
type WalReceiverId = usize;
|
||||
|
||||
impl WalReceivers {
|
||||
pub fn new() -> Arc<WalReceivers> {
|
||||
Arc::new(WalReceivers {
|
||||
mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }),
|
||||
})
|
||||
}
|
||||
|
||||
/// Register new walreceiver. Returned guard provides access to the slot and
|
||||
/// automatically deregisters in Drop.
|
||||
pub fn register(self: &Arc<WalReceivers>) -> WalReceiverGuard {
|
||||
let slots = &mut self.mutex.lock().slots;
|
||||
let walreceiver = WalReceiverState::Voting;
|
||||
// find empty slot or create new one
|
||||
let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
|
||||
slots[pos] = Some(walreceiver);
|
||||
pos
|
||||
} else {
|
||||
let pos = slots.len();
|
||||
slots.push(Some(walreceiver));
|
||||
pos
|
||||
};
|
||||
WalReceiverGuard {
|
||||
id: pos,
|
||||
walreceivers: self.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get reference to locked slot contents. Slot must exist (registered
|
||||
/// earlier).
|
||||
fn get_slot<'a>(
|
||||
self: &'a Arc<WalReceivers>,
|
||||
id: WalReceiverId,
|
||||
) -> MappedMutexGuard<'a, WalReceiverState> {
|
||||
MutexGuard::map(self.mutex.lock(), |locked| {
|
||||
locked.slots[id]
|
||||
.as_mut()
|
||||
.expect("walreceiver doesn't exist")
|
||||
})
|
||||
}
|
||||
|
||||
/// Get number of walreceivers (compute connections).
|
||||
pub fn get_num(self: &Arc<WalReceivers>) -> usize {
|
||||
self.mutex.lock().slots.iter().flatten().count()
|
||||
}
|
||||
|
||||
/// Get state of all walreceivers.
|
||||
pub fn get_all(self: &Arc<WalReceivers>) -> Vec<WalReceiverState> {
|
||||
self.mutex.lock().slots.iter().flatten().cloned().collect()
|
||||
}
|
||||
|
||||
/// Unregister walreceiver.
|
||||
fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
|
||||
let mut shared = self.mutex.lock();
|
||||
shared.slots[id] = None;
|
||||
}
|
||||
}
|
||||
|
||||
/// Only a few connections are expected (normally one), so store in Vec.
|
||||
struct WalReceiversShared {
|
||||
slots: Vec<Option<WalReceiverState>>,
|
||||
}
|
||||
|
||||
/// Walreceiver status. Currently only whether it passed voting stage and
|
||||
/// started receiving the stream, but it is easy to add more if needed.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum WalReceiverState {
|
||||
Voting,
|
||||
Streaming,
|
||||
}
|
||||
|
||||
/// Scope guard to access a slot in the WalReceivers registry and unregister from it in
|
||||
/// Drop.
|
||||
pub struct WalReceiverGuard {
|
||||
id: WalReceiverId,
|
||||
walreceivers: Arc<WalReceivers>,
|
||||
}
|
||||
|
||||
impl WalReceiverGuard {
|
||||
/// Get reference to locked shared state contents.
|
||||
fn get(&self) -> MappedMutexGuard<WalReceiverState> {
|
||||
self.walreceivers.get_slot(self.id)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalReceiverGuard {
|
||||
fn drop(&mut self) {
|
||||
self.walreceivers.unregister(self.id);
|
||||
}
|
||||
}
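Note: a self-contained sketch of the slot-based registry pattern above: register() takes a free slot (or grows the Vec), the guard gives access to the slot, and Drop clears it so get_num()/get_all() only ever report live connections. std::sync::Mutex stands in for parking_lot and the types are simplified assumptions.

use std::sync::{Arc, Mutex};

#[derive(Clone, Debug, PartialEq)]
enum State { Voting, Streaming }

struct Registry { slots: Mutex<Vec<Option<State>>> }

impl Registry {
    fn new() -> Arc<Self> {
        Arc::new(Registry { slots: Mutex::new(Vec::new()) })
    }

    fn register(self: &Arc<Self>) -> Guard {
        let mut slots = self.slots.lock().unwrap();
        // reuse an empty slot if there is one, otherwise grow the Vec
        let id = match slots.iter().position(|s| s.is_none()) {
            Some(pos) => { slots[pos] = Some(State::Voting); pos }
            None => { slots.push(Some(State::Voting)); slots.len() - 1 }
        };
        Guard { id, registry: Arc::clone(self) }
    }

    fn get_num(&self) -> usize {
        self.slots.lock().unwrap().iter().flatten().count()
    }

    fn set(&self, id: usize, state: State) {
        self.slots.lock().unwrap()[id] = Some(state);
    }
}

struct Guard { id: usize, registry: Arc<Registry> }

impl Drop for Guard {
    fn drop(&mut self) {
        // unregister: free the slot so the id can be reused
        self.registry.slots.lock().unwrap()[self.id] = None;
    }
}

fn main() {
    let registry = Registry::new();
    {
        let guard = registry.register();
        // e.g. flipped to Streaming once the Elected message arrives
        registry.set(guard.id, State::Streaming);
        assert_eq!(registry.get_num(), 1);
    } // guard dropped here: slot freed
    assert_eq!(registry.get_num(), 0);
}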
|
||||
|
||||
const MSG_QUEUE_SIZE: usize = 256;
|
||||
const REPLY_QUEUE_SIZE: usize = 16;
|
||||
|
||||
@@ -246,10 +350,13 @@ impl WalAcceptor {
|
||||
/// it must mean that network thread terminated.
|
||||
async fn run(&mut self) -> anyhow::Result<()> {
|
||||
// Register the connection and defer unregister.
|
||||
self.tli.on_compute_connect().await?;
|
||||
let _guard = ComputeConnectionGuard {
|
||||
// Order of the next two lines is important: we want first to remove our entry and then
|
||||
// update status which depends on registered connections.
|
||||
let _compute_conn_guard = ComputeConnectionGuard {
|
||||
timeline: Arc::clone(&self.tli),
|
||||
};
|
||||
let walreceiver_guard = self.tli.get_walreceivers().register();
|
||||
self.tli.update_status_notify().await?;
|
||||
|
||||
// After this timestamp we will stop processing AppendRequests and send a response
|
||||
// to the walproposer. walproposer sends at least one AppendRequest per second,
|
||||
@@ -263,6 +370,11 @@ impl WalAcceptor {
|
||||
}
|
||||
let mut next_msg = opt_msg.unwrap();
|
||||
|
||||
// Update walreceiver state in shmem for reporting.
|
||||
if let ProposerAcceptorMessage::Elected(_) = &next_msg {
|
||||
*walreceiver_guard.get() = WalReceiverState::Streaming;
|
||||
}
|
||||
|
||||
let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {
|
||||
// loop through AppendRequest's while it's readily available to
|
||||
// write as many WAL as possible without fsyncing
|
||||
@@ -311,6 +423,7 @@ impl WalAcceptor {
|
||||
}
|
||||
}
|
||||
|
||||
/// Calls update_status_notify in drop to update timeline status.
|
||||
struct ComputeConnectionGuard {
|
||||
timeline: Arc<Timeline>,
|
||||
}
|
||||
@@ -318,11 +431,9 @@ struct ComputeConnectionGuard {
|
||||
impl Drop for ComputeConnectionGuard {
|
||||
fn drop(&mut self) {
|
||||
let tli = self.timeline.clone();
|
||||
// tokio forbids calling blocking_send inside the runtime; see the
// comments in on_compute_disconnect for why we call blocking_send.
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = tli.on_compute_disconnect().await {
|
||||
error!("failed to unregister compute connection: {}", e);
|
||||
if let Err(e) = tli.update_status_notify().await {
|
||||
error!("failed to update timeline status: {}", e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ use utils::{
|
||||
use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
|
||||
use crate::receive_wal::WalReceivers;
|
||||
use crate::safekeeper::{
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
|
||||
SafekeeperMemState, ServerInfo, Term,
|
||||
@@ -164,8 +165,8 @@ impl SharedState {
|
||||
})
|
||||
}
|
||||
|
||||
fn is_active(&self, remote_consistent_lsn: Lsn) -> bool {
|
||||
self.is_wal_backup_required()
|
||||
fn is_active(&self, num_computes: usize, remote_consistent_lsn: Lsn) -> bool {
|
||||
self.is_wal_backup_required(num_computes)
|
||||
// FIXME: add tracking of relevant pageservers and check them here individually,
|
||||
// otherwise migration won't work (we suspend too early).
|
||||
|| remote_consistent_lsn < self.sk.inmem.commit_lsn
|
||||
@@ -173,29 +174,34 @@ impl SharedState {
|
||||
|
||||
/// Mark timeline active/inactive and return whether s3 offloading requires
|
||||
/// start/stop action.
|
||||
fn update_status(&mut self, remote_consistent_lsn: Lsn, ttid: TenantTimelineId) -> bool {
|
||||
let is_active = self.is_active(remote_consistent_lsn);
|
||||
fn update_status(
|
||||
&mut self,
|
||||
num_computes: usize,
|
||||
remote_consistent_lsn: Lsn,
|
||||
ttid: TenantTimelineId,
|
||||
) -> bool {
|
||||
let is_active = self.is_active(num_computes, remote_consistent_lsn);
|
||||
if self.active != is_active {
|
||||
info!("timeline {} active={} now", ttid, is_active);
|
||||
}
|
||||
self.active = is_active;
|
||||
self.is_wal_backup_action_pending()
|
||||
self.is_wal_backup_action_pending(num_computes)
|
||||
}
|
||||
|
||||
/// Should we run s3 offloading in current state?
|
||||
fn is_wal_backup_required(&self) -> bool {
|
||||
fn is_wal_backup_required(&self, num_computes: usize) -> bool {
|
||||
let seg_size = self.get_wal_seg_size();
|
||||
self.num_computes > 0 ||
|
||||
num_computes > 0 ||
|
||||
// Currently only the whole segment is offloaded, so compare segment numbers.
|
||||
(self.sk.inmem.commit_lsn.segment_number(seg_size) >
|
||||
self.sk.inmem.backup_lsn.segment_number(seg_size))
|
||||
(self.sk.inmem.commit_lsn.segment_number(seg_size) >
|
||||
self.sk.inmem.backup_lsn.segment_number(seg_size))
|
||||
}
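Note: after this change the caller passes the number of compute connections explicitly instead of reading it from shared state. A small sketch of the resulting predicate, under the assumption that segment_number is simply the LSN divided by the WAL segment size; the 16 MB segment size in the example is also an assumption.

fn segment_number(lsn: u64, seg_size: u64) -> u64 {
    // assumption: segment number = lsn / wal_seg_size
    lsn / seg_size
}

fn is_wal_backup_required(num_computes: usize, commit_lsn: u64, backup_lsn: u64, seg_size: u64) -> bool {
    num_computes > 0
        // only whole segments are offloaded, so compare segment numbers
        || segment_number(commit_lsn, seg_size) > segment_number(backup_lsn, seg_size)
}

fn main() {
    let seg = 16 * 1024 * 1024u64; // 16 MB WAL segment size, assumed for the example
    // no computes attached and commit/backup sit in the same segment: nothing to offload
    assert!(!is_wal_backup_required(0, seg + 100, seg + 50, seg));
    // commit LSN crossed into the next segment: offloading is required even with no computes
    assert!(is_wal_backup_required(0, 2 * seg + 1, seg + 50, seg));
}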
|
||||
|
||||
/// Is the current state of s3 offloading not what it ought to be?
|
||||
fn is_wal_backup_action_pending(&self) -> bool {
|
||||
let res = self.wal_backup_active != self.is_wal_backup_required();
|
||||
fn is_wal_backup_action_pending(&self, num_computes: usize) -> bool {
|
||||
let res = self.wal_backup_active != self.is_wal_backup_required(num_computes);
|
||||
if res {
|
||||
let action_pending = if self.is_wal_backup_required() {
|
||||
let action_pending = if self.is_wal_backup_required(num_computes) {
|
||||
"start"
|
||||
} else {
|
||||
"stop"
|
||||
@@ -210,8 +216,8 @@ impl SharedState {
|
||||
|
||||
/// Returns whether s3 offloading is required and sets current status as
|
||||
/// matching.
|
||||
fn wal_backup_attend(&mut self) -> bool {
|
||||
self.wal_backup_active = self.is_wal_backup_required();
|
||||
fn wal_backup_attend(&mut self, num_computes: usize) -> bool {
|
||||
self.wal_backup_active = self.is_wal_backup_required(num_computes);
|
||||
self.wal_backup_active
|
||||
}
|
||||
|
||||
@@ -295,6 +301,7 @@ pub struct Timeline {
|
||||
/// while holding it, ensuring that consensus checks are in order.
|
||||
mutex: Mutex<SharedState>,
|
||||
walsenders: Arc<WalSenders>,
|
||||
walreceivers: Arc<WalReceivers>,
|
||||
|
||||
/// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal.
|
||||
cancellation_tx: watch::Sender<bool>,
|
||||
@@ -329,6 +336,7 @@ impl Timeline {
|
||||
commit_lsn_watch_rx,
|
||||
mutex: Mutex::new(shared_state),
|
||||
walsenders: WalSenders::new(rcl),
|
||||
walreceivers: WalReceivers::new(),
|
||||
cancellation_rx,
|
||||
cancellation_tx,
|
||||
timeline_dir: conf.timeline_dir(&ttid),
|
||||
@@ -355,6 +363,7 @@ impl Timeline {
|
||||
commit_lsn_watch_rx,
|
||||
mutex: Mutex::new(SharedState::create_new(&conf, &ttid, state)?),
|
||||
walsenders: WalSenders::new(Lsn(0)),
|
||||
walreceivers: WalReceivers::new(),
|
||||
cancellation_rx,
|
||||
cancellation_tx,
|
||||
timeline_dir: conf.timeline_dir(&ttid),
|
||||
@@ -441,40 +450,22 @@ impl Timeline {
|
||||
}
|
||||
|
||||
fn update_status(&self, shared_state: &mut SharedState) -> bool {
|
||||
shared_state.update_status(self.get_walsenders().get_remote_consistent_lsn(), self.ttid)
|
||||
shared_state.update_status(
|
||||
self.walreceivers.get_num(),
|
||||
self.get_walsenders().get_remote_consistent_lsn(),
|
||||
self.ttid,
|
||||
)
|
||||
}
|
||||
|
||||
/// Register compute connection, starting timeline-related activity if it is
|
||||
/// not running yet.
|
||||
pub async fn on_compute_connect(&self) -> Result<()> {
|
||||
/// Update timeline status and kick wal backup launcher to stop/start offloading if needed.
|
||||
pub async fn update_status_notify(&self) -> Result<()> {
|
||||
if self.is_cancelled() {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
|
||||
let is_wal_backup_action_pending: bool;
|
||||
{
|
||||
let is_wal_backup_action_pending: bool = {
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
shared_state.num_computes += 1;
|
||||
is_wal_backup_action_pending = self.update_status(&mut shared_state);
|
||||
}
|
||||
// Wake up wal backup launcher, if offloading not started yet.
|
||||
if is_wal_backup_action_pending {
|
||||
// Can fail only if channel to a static thread got closed, which is not normal at all.
|
||||
self.wal_backup_launcher_tx.send(self.ttid).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// De-register compute connection, shutting down timeline activity if
|
||||
/// pageserver doesn't need catchup.
|
||||
pub async fn on_compute_disconnect(&self) -> Result<()> {
|
||||
let is_wal_backup_action_pending: bool;
|
||||
{
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
shared_state.num_computes -= 1;
|
||||
is_wal_backup_action_pending = self.update_status(&mut shared_state);
|
||||
}
|
||||
// Wake up wal backup launcher, if it is time to stop the offloading.
|
||||
self.update_status(&mut shared_state)
|
||||
};
|
||||
if is_wal_backup_action_pending {
|
||||
// Can fail only if channel to a static thread got closed, which is not normal at all.
|
||||
self.wal_backup_launcher_tx.send(self.ttid).await?;
|
||||
@@ -519,7 +510,9 @@ impl Timeline {
|
||||
return false;
|
||||
}
|
||||
|
||||
self.write_shared_state().await.wal_backup_attend()
|
||||
self.write_shared_state()
|
||||
.await
|
||||
.wal_backup_attend(self.walreceivers.get_num())
|
||||
}
|
||||
|
||||
/// Returns commit_lsn watch channel.
|
||||
@@ -650,6 +643,10 @@ impl Timeline {
|
||||
&self.walsenders
|
||||
}
|
||||
|
||||
pub fn get_walreceivers(&self) -> &Arc<WalReceivers> {
|
||||
&self.walreceivers
|
||||
}
|
||||
|
||||
/// Returns flush_lsn.
|
||||
pub async fn get_flush_lsn(&self) -> Lsn {
|
||||
self.write_shared_state().await.sk.wal_store.flush_lsn()
|
||||
|
||||
@@ -1,76 +0,0 @@
|
||||
#! /usr/bin/env python3
|
||||
# Script to generate ext_index.json metadata file
|
||||
# that stores content of the control files and location of extension archives
|
||||
# for all extensions in extensions subdir.
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
"""
|
||||
# ext_index.json example:
|
||||
{
|
||||
"public_extensions": [
|
||||
"anon"
|
||||
],
|
||||
"library_index": {
|
||||
"anon": "anon",
|
||||
// for more complex extensions like postgis
|
||||
// we might have something like:
|
||||
// address_standardizer: postgis
|
||||
// postgis_tiger: postgis
|
||||
},
|
||||
"extension_data": {
|
||||
"anon": {
|
||||
"control_data": {
|
||||
"anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
|
||||
},
|
||||
"archive_path": "5648391853/v15/extensions/anon.tar.zst"
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="generate ext_index.json")
|
||||
parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version")
|
||||
parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image")
|
||||
parser.add_argument("--public_extensions", type=str, help="list of public extensions")
|
||||
args = parser.parse_args()
|
||||
pg_version = args.pg_version
|
||||
BUILD_TAG = args.BUILD_TAG
|
||||
public_ext_list = args.public_extensions.split(",")
|
||||
|
||||
ext_index = {}
|
||||
library_index = {}
|
||||
EXT_PATH = Path("extensions")
|
||||
for extension in EXT_PATH.iterdir():
|
||||
if extension.is_dir():
|
||||
control_data = {}
|
||||
for control_file in extension.glob("*.control"):
|
||||
if control_file.suffix != ".control":
|
||||
continue
|
||||
with open(control_file, "r") as f:
|
||||
control_data[control_file.name] = f.read()
|
||||
ext_index[extension.name] = {
|
||||
"control_data": control_data,
|
||||
"archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst",
|
||||
}
|
||||
elif extension.suffix == ".zst":
|
||||
file_list = (
|
||||
str(subprocess.check_output(["tar", "tf", str(extension)]), "utf-8")
|
||||
.strip()
|
||||
.split("\n")
|
||||
)
|
||||
for file in file_list:
|
||||
if file.endswith(".so") and file.startswith("lib/"):
|
||||
lib_name = file[4:-3]
|
||||
library_index[lib_name] = extension.name.replace(".tar.zst", "")
|
||||
|
||||
all_data = {
|
||||
"public_extensions": public_ext_list,
|
||||
"library_index": library_index,
|
||||
"extension_data": ext_index,
|
||||
}
|
||||
with open("ext_index.json", "w") as f:
|
||||
json.dump(all_data, f)
|
||||
@@ -70,6 +70,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_getpage_reconstruct_seconds_count",
|
||||
"pageserver_getpage_reconstruct_seconds_sum",
|
||||
*[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
|
||||
*histogram("pageserver_smgr_query_seconds_global"),
|
||||
*histogram("pageserver_read_num_fs_layers"),
|
||||
*histogram("pageserver_getpage_get_reconstruct_data_seconds"),
|
||||
*histogram("pageserver_wait_lsn_seconds"),
|
||||
|
||||
@@ -369,7 +369,7 @@ def test_download_remote_layers_api(
|
||||
filled_current_physical = get_api_current_physical_size()
|
||||
log.info(filled_current_physical)
|
||||
filled_size = get_resident_physical_size()
|
||||
log.info(filled_size)
|
||||
log.info(f"filled_size: {filled_size}")
|
||||
assert filled_current_physical == filled_size, "we don't yet do layer eviction"
|
||||
|
||||
env.pageserver.stop()
|
||||
@@ -377,7 +377,7 @@ def test_download_remote_layers_api(
|
||||
# remove all the layer files
|
||||
# XXX only delete some of the layer files, to show that it really just downloads all the layers
|
||||
for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
|
||||
log.info(f"unlinking layer {layer}")
|
||||
log.info(f"unlinking layer {layer.name}")
|
||||
layer.unlink()
|
||||
|
||||
# Shut down safekeepers before starting the pageserver.
|
||||
@@ -403,7 +403,7 @@ def test_download_remote_layers_api(
|
||||
filled_current_physical == get_api_current_physical_size()
|
||||
), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote"
|
||||
post_unlink_size = get_resident_physical_size()
|
||||
log.info(post_unlink_size)
|
||||
log.info(f"post_unlink_size: {post_unlink_size}")
|
||||
assert (
|
||||
post_unlink_size < filled_size
|
||||
), "we just deleted layers and didn't cause anything to re-download them yet"
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
# Configuration for cgroups in VM compute nodes
|
||||
group neon-postgres {
|
||||
perm {
|
||||
admin {
|
||||
uid = vm-informant;
|
||||
}
|
||||
task {
|
||||
gid = users;
|
||||
}
|
||||
}
|
||||
memory {}
|
||||
}
|
||||
@@ -14,11 +14,13 @@ publish = false
|
||||
### BEGIN HAKARI SECTION
|
||||
[dependencies]
|
||||
anyhow = { version = "1", features = ["backtrace"] }
|
||||
axum = { version = "0.6", features = ["ws"] }
|
||||
bytes = { version = "1", features = ["serde"] }
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
|
||||
clap = { version = "4", features = ["derive", "string"] }
|
||||
clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
|
||||
crossbeam-utils = { version = "0.8" }
|
||||
digest = { version = "0.10", features = ["mac", "std"] }
|
||||
either = { version = "1" }
|
||||
fail = { version = "0.5", default-features = false, features = ["failpoints"] }
|
||||
futures = { version = "0.3" }
|
||||
@@ -27,6 +29,7 @@ futures-core = { version = "0.3" }
|
||||
futures-executor = { version = "0.3" }
|
||||
futures-sink = { version = "0.3" }
|
||||
futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
|
||||
hyper = { version = "0.14", features = ["full"] }
|
||||
itertools = { version = "0.10" }
|
||||
libc = { version = "0.2", features = ["extra_traits"] }
|
||||
log = { version = "0.4", default-features = false, features = ["std"] }
|
||||
@@ -45,6 +48,7 @@ rustls = { version = "0.20", features = ["dangerous_configuration"] }
|
||||
scopeguard = { version = "1" }
|
||||
serde = { version = "1", features = ["alloc", "derive"] }
|
||||
serde_json = { version = "1", features = ["raw_value"] }
|
||||
smallvec = { version = "1", default-features = false, features = ["write"] }
|
||||
socket2 = { version = "0.4", default-features = false, features = ["all"] }
|
||||
tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
|
||||
tokio-rustls = { version = "0.23" }
|
||||
@@ -54,7 +58,6 @@ toml_edit = { version = "0.19", features = ["serde"] }
|
||||
tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }
|
||||
tracing = { version = "0.1", features = ["log"] }
|
||||
tracing-core = { version = "0.1" }
|
||||
tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] }
|
||||
url = { version = "2", features = ["serde"] }
|
||||
|
||||
[build-dependencies]
|
||||
|
||||