Mirror of https://github.com/neondatabase/neon.git
Synced 2026-02-09 13:40:38 +00:00

Compare commits: jcsp/gener... alek/remot... (2 Commits)

| Author | SHA1 | Date |
|---|---|---|
|  | 595baa386e |  |
|  | bb8ca7c7fd |  |
@@ -145,11 +145,7 @@ runs:

if [ "${RERUN_FLAKY}" == "true" ]; then
mkdir -p $TEST_OUTPUT
poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" \
--days 7 \
--output "$TEST_OUTPUT/flaky.json" \
--pg-version "${DEFAULT_PG_VERSION}" \
--build-type "${BUILD_TYPE}"
poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json"

EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
fi
89  .github/workflows/build_and_test.yml  vendored
@@ -737,6 +737,34 @@ jobs:
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--cleanup

# Due to a kaniko bug, we can't use cache for the extensions image, thus it takes about the same amount of time as the compute-node image to build (~10 min)
# During the transition period we need to have extensions in both places (in S3 and in the compute-node image),
# so we won't build extensions twice, but extract them from compute-node.
#
# For now we use the extensions image only for new custom extensions
- name: Kaniko build extensions only
run: |
# Kaniko is supposed to clean up after itself if the --cleanup flag is set, but it doesn't.
# Although some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
# it still fails with the error:
# error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
#
# Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;

/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--context . \
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \
--build-arg PG_VERSION=${{ matrix.version }} \
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
--dockerfile Dockerfile.compute-node \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--cleanup \
--target postgres-extensions

# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
@@ -752,7 +780,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.17.5
VM_BUILDER_VERSION: v0.16.3

steps:
- name: Checkout
@@ -775,7 +803,7 @@ jobs:
run: |
./vm-builder \
-enable-file-cache \
-cgroup-uid=postgres \
-enable-monitor \
-src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

@@ -858,8 +886,10 @@ jobs:
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest

- name: Push images to production ECR
if: |
@@ -870,8 +900,10 @@ jobs:
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest

- name: Configure Docker Hub login
run: |
@@ -893,8 +925,10 @@ jobs:
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest

- name: Cleanup ECR folder
run: rm -rf ~/.ecr
@@ -904,7 +938,7 @@ jobs:
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
needs: [ tag ]
needs: [ promote-images, tag ]
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
@@ -939,10 +973,57 @@ jobs:
}
}"

upload-postgres-extensions-to-s3:
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
needs: [ tag, promote-images ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15 ]

env:
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ github.ref_name == 'release' && 'latest' || needs.tag.outputs.build-tag }}
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}

steps:
- name: Pull postgres-extensions image
run: |
docker pull ${EXTENSIONS_IMAGE}

- name: Create postgres-extensions container
id: create-container
run: |
EID=$(docker create ${EXTENSIONS_IMAGE} true)
echo "EID=${EID}" >> $GITHUB_OUTPUT

- name: Extract postgres-extensions from container
run: |
rm -rf ./extensions-to-upload # Just in case
mkdir -p extensions-to-upload

docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/

- name: Upload postgres-extensions to S3
run: |
for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
done

- name: Cleanup
if: ${{ always() && steps.create-container.outputs.EID }}
run: |
docker rm ${{ steps.create-container.outputs.EID }} || true

deploy:
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
needs: [ promote-images, tag, regress-tests ]
needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
steps:
- name: Fix git ownership
13  CODEOWNERS
@@ -1,12 +1,11 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/compute_tools/ @neondatabase/control-plane
/control_plane/ @neondatabase/compute @neondatabase/storage
/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
/pageserver/ @neondatabase/compute @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/pageserver/ @neondatabase/compute @neondatabase/storage
/pgxn/ @neondatabase/compute
/proxy/ @neondatabase/proxy
/proxy/ @neondatabase/control-plane
/safekeeper/ @neondatabase/safekeepers
/vendor/ @neondatabase/compute
228  Cargo.lock  generated
@@ -190,7 +190,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -201,7 +201,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -553,13 +553,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.6.20"
|
||||
version = "0.6.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf"
|
||||
checksum = "f8175979259124331c1d7bf6586ee7e0da434155e4b2d48ec2c8386281d8df39"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"axum-core",
|
||||
"base64 0.21.1",
|
||||
"bitflags",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
@@ -574,13 +573,7 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
"rustversion",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_path_to_error",
|
||||
"serde_urlencoded",
|
||||
"sha1",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tokio-tungstenite 0.20.0",
|
||||
"tower",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
@@ -680,7 +673,7 @@ dependencies = [
|
||||
"regex",
|
||||
"rustc-hash",
|
||||
"shlex",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
"which",
|
||||
]
|
||||
|
||||
@@ -772,19 +765,6 @@ version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "cgroups-rs"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fb3af90c8d48ad5f432d8afb521b5b40c2a2fce46dd60e05912de51c47fba64"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
"nix 0.25.1",
|
||||
"regex",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.24"
|
||||
@@ -869,7 +849,7 @@ dependencies = [
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -927,7 +907,6 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"cfg-if",
|
||||
"chrono",
|
||||
"clap",
|
||||
"compute_api",
|
||||
@@ -946,7 +925,6 @@ dependencies = [
|
||||
"tar",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
@@ -954,7 +932,6 @@ dependencies = [
|
||||
"tracing-utils",
|
||||
"url",
|
||||
"utils",
|
||||
"vm_monitor",
|
||||
"workspace_hack",
|
||||
"zstd",
|
||||
]
|
||||
@@ -1001,8 +978,7 @@ dependencies = [
|
||||
"comfy-table",
|
||||
"compute_api",
|
||||
"git-version",
|
||||
"hyper",
|
||||
"nix 0.26.2",
|
||||
"nix",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"postgres",
|
||||
@@ -1017,7 +993,6 @@ dependencies = [
|
||||
"storage_broker",
|
||||
"tar",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"toml",
|
||||
"tracing",
|
||||
"url",
|
||||
@@ -1209,7 +1184,7 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"strsim",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1220,7 +1195,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1285,7 +1260,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1341,7 +1316,7 @@ dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1537,7 +1512,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1888,8 +1863,8 @@ dependencies = [
|
||||
"hyper",
|
||||
"pin-project",
|
||||
"tokio",
|
||||
"tokio-tungstenite 0.18.0",
|
||||
"tungstenite 0.18.0",
|
||||
"tokio-tungstenite",
|
||||
"tungstenite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1953,19 +1928,6 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "inotify"
|
||||
version = "0.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"futures-core",
|
||||
"inotify-sys",
|
||||
"libc",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "inotify-sys"
|
||||
version = "0.1.5"
|
||||
@@ -2289,18 +2251,6 @@ dependencies = [
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.25.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f346ff70e7dbfd675fe90590b92d59ef2de15a8779ae305ebcbfd3f0caf59be4"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"bitflags",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.26.2"
|
||||
@@ -2335,7 +2285,7 @@ dependencies = [
|
||||
"crossbeam-channel",
|
||||
"filetime",
|
||||
"fsevent-sys",
|
||||
"inotify 0.9.6",
|
||||
"inotify",
|
||||
"kqueue",
|
||||
"libc",
|
||||
"mio",
|
||||
@@ -2343,15 +2293,6 @@ dependencies = [
|
||||
"windows-sys 0.45.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ntapi"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-bigint"
|
||||
version = "0.4.3"
|
||||
@@ -2445,7 +2386,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2632,7 +2573,7 @@ dependencies = [
|
||||
"hyper",
|
||||
"itertools",
|
||||
"metrics",
|
||||
"nix 0.26.2",
|
||||
"nix",
|
||||
"num-traits",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
@@ -2655,7 +2596,6 @@ dependencies = [
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"signal-hook",
|
||||
"smallvec",
|
||||
"storage_broker",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
@@ -2686,7 +2626,6 @@ dependencies = [
|
||||
"bytes",
|
||||
"const_format",
|
||||
"enum-map",
|
||||
"hex",
|
||||
"postgres_ffi",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -2834,7 +2773,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3031,7 +2970,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3042,9 +2981,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.66"
|
||||
version = "1.0.64"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
|
||||
checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
@@ -3206,9 +3145,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.32"
|
||||
version = "1.0.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
|
||||
checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
@@ -3859,22 +3798,22 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.183"
|
||||
version = "1.0.163"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
|
||||
checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.183"
|
||||
version = "1.0.163"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
|
||||
checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3888,16 +3827,6 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_path_to_error"
|
||||
version = "0.1.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4beec8bce849d58d06238cb50db2e1c417cfeafa4c63f692b15c82b7c80f8335"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_spanned"
|
||||
version = "0.6.2"
|
||||
@@ -3944,7 +3873,7 @@ dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4043,9 +3972,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.11.0"
|
||||
version = "1.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
|
||||
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
|
||||
|
||||
[[package]]
|
||||
name = "socket2"
|
||||
@@ -4182,9 +4111,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.28"
|
||||
version = "2.0.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
|
||||
checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -4209,21 +4138,6 @@ dependencies = [
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sysinfo"
|
||||
version = "0.29.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "165d6d8539689e3d3bc8b98ac59541e1f21c7de7c85d60dc80e43ae0ed2113db"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
"ntapi",
|
||||
"once_cell",
|
||||
"rayon",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tar"
|
||||
version = "0.4.40"
|
||||
@@ -4314,7 +4228,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4429,7 +4343,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4535,19 +4449,7 @@ dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tungstenite 0.18.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-tungstenite"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b2dbec703c26b00d74844519606ef15d09a7d6857860f84ad223dec002ddea2"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tungstenite 0.20.0",
|
||||
"tungstenite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4739,7 +4641,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4868,25 +4770,6 @@ dependencies = [
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tungstenite"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e862a1c4128df0112ab625f55cd5c934bcb4312ba80b39ae4b4835a3fd58e649"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"data-encoding",
|
||||
"http",
|
||||
"httparse",
|
||||
"log",
|
||||
"rand",
|
||||
"sha1",
|
||||
"thiserror",
|
||||
"url",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
version = "1.16.0"
|
||||
@@ -5014,10 +4897,9 @@ dependencies = [
|
||||
"hyper",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"nix 0.26.2",
|
||||
"nix",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"postgres_connection",
|
||||
"pq_proto",
|
||||
"rand",
|
||||
"regex",
|
||||
@@ -5033,7 +4915,6 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-error",
|
||||
"tracing-subscriber",
|
||||
@@ -5070,28 +4951,6 @@ version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
||||
|
||||
[[package]]
|
||||
name = "vm_monitor"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
"cgroups-rs",
|
||||
"clap",
|
||||
"futures",
|
||||
"inotify 0.10.2",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sysinfo",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "vsimd"
|
||||
version = "0.8.0"
|
||||
@@ -5162,7 +5021,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
@@ -5196,7 +5055,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
"wasm-bindgen-backend",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
@@ -5481,14 +5340,12 @@ name = "workspace_hack"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
"bytes",
|
||||
"cc",
|
||||
"chrono",
|
||||
"clap",
|
||||
"clap_builder",
|
||||
"crossbeam-utils",
|
||||
"digest",
|
||||
"either",
|
||||
"fail",
|
||||
"futures",
|
||||
@@ -5497,7 +5354,6 @@ dependencies = [
|
||||
"futures-executor",
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"hyper",
|
||||
"itertools",
|
||||
"libc",
|
||||
"log",
|
||||
@@ -5516,10 +5372,9 @@ dependencies = [
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"smallvec",
|
||||
"socket2 0.4.9",
|
||||
"syn 1.0.109",
|
||||
"syn 2.0.28",
|
||||
"syn 2.0.16",
|
||||
"tokio",
|
||||
"tokio-rustls 0.23.4",
|
||||
"tokio-util",
|
||||
@@ -5528,6 +5383,7 @@ dependencies = [
|
||||
"tower",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
]
|
||||
|
||||
|
||||
@@ -23,7 +23,6 @@ members = [
|
||||
"libs/remote_storage",
|
||||
"libs/tracing-utils",
|
||||
"libs/postgres_ffi/wal_craft",
|
||||
"libs/vm_monitor",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -42,14 +41,12 @@ aws-sdk-s3 = "0.27"
|
||||
aws-smithy-http = "0.55"
|
||||
aws-credential-types = "0.55"
|
||||
aws-types = "0.55"
|
||||
axum = { version = "0.6.20", features = ["ws"] }
|
||||
base64 = "0.13.0"
|
||||
bincode = "1.3"
|
||||
bindgen = "0.65"
|
||||
bstr = "1.0"
|
||||
byteorder = "1.4"
|
||||
bytes = "1.0"
|
||||
cfg-if = "1.0.0"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||
clap = { version = "4.0", features = ["derive"] }
|
||||
close_fds = "0.3.2"
|
||||
@@ -77,7 +74,6 @@ humantime = "2.1"
|
||||
humantime-serde = "1.1.1"
|
||||
hyper = "0.14"
|
||||
hyper-tungstenite = "0.9"
|
||||
inotify = "0.10.2"
|
||||
itertools = "0.10"
|
||||
jsonwebtoken = "8"
|
||||
libc = "0.2"
|
||||
@@ -109,14 +105,12 @@ rustls = "0.20"
|
||||
rustls-pemfile = "1"
|
||||
rustls-split = "0.3"
|
||||
scopeguard = "1.1"
|
||||
sysinfo = "0.29.2"
|
||||
sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "2.0"
|
||||
sha2 = "0.10.2"
|
||||
signal-hook = "0.3"
|
||||
smallvec = "1.11"
|
||||
socket2 = "0.5"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
@@ -139,7 +133,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
||||
tracing = "0.1"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-opentelemetry = "0.19.0"
|
||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
|
||||
url = "2.2"
|
||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||
walkdir = "2.3.2"
|
||||
@@ -175,7 +169,6 @@ storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main br
|
||||
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
||||
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
||||
utils = { version = "0.1", path = "./libs/utils/" }
|
||||
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
|
||||
|
||||
## Common library dependency
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
|
||||
@@ -211,8 +211,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
FROM build-deps AS vector-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \
echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -764,6 +764,29 @@ RUN rm -r /usr/local/pgsql/include
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a

#########################################################################################
#
# Extensions only
#
#########################################################################################
FROM python:3.9-slim-bullseye AS generate-ext-index
ARG PG_VERSION
ARG BUILD_TAG
RUN apt update && apt install -y zstd

# copy the control files here
COPY --from=kq-imcx-pg-build /extensions/ /extensions/
COPY --from=pg-anon-pg-build /extensions/ /extensions/
COPY --from=postgis-build /extensions/ /extensions/
COPY scripts/combine_control_files.py ./combine_control_files.py
RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"

FROM scratch AS postgres-extensions
# After the transition this layer will include all extensions.
# For now, it's only a couple for testing purposes.
COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
COPY --from=generate-ext-index /ext_index.json /ext_index.json

#########################################################################################
#
# Final layer
@@ -8,7 +8,6 @@ license.workspace = true
anyhow.workspace = true
async-compression.workspace = true
chrono.workspace = true
cfg-if.workspace = true
clap.workspace = true
flate2.workspace = true
futures.workspace = true
@@ -24,7 +23,6 @@ tar.workspace = true
reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
@@ -36,5 +34,4 @@ utils.workspace = true
workspace_hack.workspace = true
toml_edit.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
zstd = "0.12.4"
@@ -19,10 +19,9 @@ Also `compute_ctl` spawns two separate service threads:
- `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
last activity requests.

If the `AUTOSCALING` environment variable is set, `compute_ctl` will start the
`vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
`vm-monitor` communicates with the VM autoscaling system. It coordinates
downscaling and requests immediate upscaling under resource pressure.
If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
downscaling and (eventually) will request immediate upscaling under resource pressure.

Usage example:
```sh
@@ -20,10 +20,9 @@
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
//! last activity requests.
//!
//! If the `AUTOSCALING` environment variable is set, `compute_ctl` will start the
//! `vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
//! `vm-monitor` communicates with the VM autoscaling system. It coordinates
//! downscaling and requests immediate upscaling under resource pressure.
//! If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
//! compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
//! downscaling and (eventually) will request immediate upscaling under resource pressure.
//!
//! Usage example:
//! ```sh
@@ -36,6 +35,7 @@
//!
use std::collections::HashMap;
use std::fs::File;
use std::panic;
use std::path::Path;
use std::process::exit;
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
@@ -271,57 +271,6 @@ fn main() -> Result<()> {
}
};

// Start the vm-monitor if directed to. The vm-monitor only runs on linux
// because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
use std::env;
use tokio_util::sync::CancellationToken;
use tracing::warn;
let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
let cgroup = matches.get_one::<String>("cgroup");
let file_cache_on_disk = matches.get_flag("file-cache-on-disk");

// Only make a runtime if we need to.
// Note: it seems like you can make a runtime in an inner scope and
// if you start a task in it it won't be dropped. However, make it
// in the outermost scope just to be safe.
let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
(None, None) => None,
(None, Some(_)) => {
warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
None
}
(Some(_), None) => {
panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
}
(Some(_), Some(_)) => Some(
tokio::runtime::Builder::new_multi_thread()
.worker_threads(4)
.enable_all()
.build()
.expect("failed to create tokio runtime for monitor"),
),
};

// This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new();

let vm_monitor = &rt.as_ref().map(|rt| {
rt.spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(),
pgconnstr: file_cache_connstr.cloned(),
addr: vm_monitor_addr.cloned().unwrap(),
file_cache_on_disk,
})),
token.clone(),
))
});
}
}

// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
if let Some(mut pg) = pg {
@@ -335,24 +284,6 @@ fn main() -> Result<()> {
exit_code = ecode.code()
}

// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
if let Some(handle) = vm_monitor {
// Kills all threads spawned by the monitor
token.cancel();
// Kills the actual task running the monitor
handle.abort();

// If handle is some, rt must have been used to produce it, and
// hence is also some
rt.unwrap().shutdown_timeout(Duration::from_secs(2));
}
}
}

// Maybe sync safekeepers again, to speed up next startup
let compute_state = compute.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
@@ -462,34 +393,6 @@ fn cli() -> clap::Command {
.long("remote-ext-config")
.value_name("REMOTE_EXT_CONFIG"),
)
// TODO(fprasx): we currently have default arguments because the cloud PR
// to pass them in hasn't been merged yet. We should get rid of them once
// the PR is merged.
.arg(
Arg::new("vm-monitor-addr")
.long("vm-monitor-addr")
.default_value("0.0.0.0:10301")
.value_name("VM_MONITOR_ADDR"),
)
.arg(
Arg::new("cgroup")
.long("cgroup")
.default_value("neon-postgres")
.value_name("CGROUP"),
)
.arg(
Arg::new("filecache-connstr")
.long("filecache-connstr")
.default_value(
"host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
)
.value_name("FILECACHE_CONNSTR"),
)
.arg(
Arg::new("file-cache-on-disk")
.long("file-cache-on-disk")
.action(clap::ArgAction::SetTrue),
)
}

#[test]
@@ -1,5 +1,4 @@
use std::collections::HashMap;
use std::env;
use std::fs;
use std::io::BufRead;
use std::os::unix::fs::PermissionsExt;
@@ -176,27 +175,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
}
}

/// If we are a VM, returns a [`Command`] that will run in the `neon-postgres`
/// cgroup. Otherwise returns the default `Command::new(cmd)`
///
/// This function should be used to start postgres, as it will start it in the
/// neon-postgres cgroup if we are a VM. This allows autoscaling to control
/// postgres' resource usage. The cgroup will exist in VMs because vm-builder
/// creates it during the sysinit phase of its inittab.
fn maybe_cgexec(cmd: &str) -> Command {
// The cplane sets this env var for autoscaling computes.
// use `var_os` so we don't have to worry about the variable being valid
// unicode. Should never be a concern, but just in case.
if env::var_os("AUTOSCALING").is_some() {
let mut command = Command::new("cgexec");
command.args(["-g", "memory:neon-postgres"]);
command.arg(cmd);
command
} else {
Command::new(cmd)
}
}
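As an illustration only (not part of the diff above), here is a minimal, self-contained Rust sketch of how a caller would use a `maybe_cgexec`-style wrapper. The function is re-stated in simplified form, and `echo` stands in for the real postgres binary path; both are assumptions for the example.

```rust
use std::env;
use std::process::Command;

// Simplified re-statement of the wrapper above: wrap the command in cgexec
// (placing it in the neon-postgres cgroup) only when AUTOSCALING is set.
fn maybe_cgexec(cmd: &str) -> Command {
    if env::var_os("AUTOSCALING").is_some() {
        let mut command = Command::new("cgexec");
        command.args(["-g", "memory:neon-postgres"]);
        command.arg(cmd);
        command
    } else {
        Command::new(cmd)
    }
}

fn main() -> std::io::Result<()> {
    // With AUTOSCALING unset this runs `echo -D /tmp/pgdata` directly;
    // with it set, the same call becomes `cgexec -g memory:neon-postgres echo -D /tmp/pgdata`.
    let status = maybe_cgexec("echo").args(["-D", "/tmp/pgdata"]).status()?;
    println!("child exited with {status}");
    Ok(())
}
```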

/// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
/// that we give to customers
fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
@@ -473,7 +451,7 @@ impl ComputeNode {
pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
let start_time = Utc::now();

let sync_handle = maybe_cgexec(&self.pgbin)
let sync_handle = Command::new(&self.pgbin)
.args(["--sync-safekeepers"])
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
.envs(if let Some(storage_auth_token) = &storage_auth_token {
@@ -608,7 +586,7 @@ impl ComputeNode {

// Start postgres
info!("starting postgres");
let mut pg = maybe_cgexec(&self.pgbin)
let mut pg = Command::new(&self.pgbin)
.args(["-D", pgdata])
.spawn()
.expect("cannot start postgres process");
@@ -636,7 +614,7 @@ impl ComputeNode {
let pgdata_path = Path::new(&self.pgdata);

// Run postgres as a child process.
let mut pg = maybe_cgexec(&self.pgbin)
let mut pg = Command::new(&self.pgbin)
.args(["-D", &self.pgdata])
.envs(if let Some(storage_auth_token) = &storage_auth_token {
vec![("NEON_AUTH_TOKEN", storage_auth_token)]
@@ -108,10 +108,12 @@ pub fn get_pg_version(pgbin: &str) -> String {
// pg_config --version returns a (platform specific) human readable string
// such as "PostgreSQL 15.4". We parse this to v14/v15
let human_version = get_pg_config("--version", pgbin);
if human_version.contains("15") {
return "v15".to_string();
} else if human_version.contains("14") {
if human_version.contains("14") {
return "v14".to_string();
} else if human_version.contains("15") {
return "v15".to_string();
} else if human_version.contains("16") {
return "v16".to_string();
}
panic!("Unsupported postgres version {human_version}");
}
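For reference, a standalone sketch of the same version-string mapping as a testable helper. The function name `parse_pg_version` and the sample strings in the test are illustrative assumptions, not taken from the repository.

```rust
/// Map the human-readable `pg_config --version` output to v14/v15/v16,
/// mirroring the substring checks in the hunk above.
fn parse_pg_version(human_version: &str) -> Option<&'static str> {
    if human_version.contains("14") {
        Some("v14")
    } else if human_version.contains("15") {
        Some("v15")
    } else if human_version.contains("16") {
        Some("v16")
    } else {
        None
    }
}

#[cfg(test)]
mod tests {
    use super::parse_pg_version;

    #[test]
    fn maps_known_majors() {
        // Sample inputs for illustration only.
        assert_eq!(parse_pg_version("PostgreSQL 15.4"), Some("v15"));
        assert_eq!(parse_pg_version("PostgreSQL 14.9"), Some("v14"));
        assert_eq!(parse_pg_version("PostgreSQL 16beta2"), Some("v16"));
        assert_eq!(parse_pg_version("PostgreSQL 13.11"), None);
    }
}
```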
@@ -12,7 +12,6 @@ git-version.workspace = true
nix.workspace = true
once_cell.workspace = true
postgres.workspace = true
hyper.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["blocking", "json"] }
serde.workspace = true
@@ -21,7 +20,6 @@ serde_with.workspace = true
tar.workspace = true
thiserror.workspace = true
toml.workspace = true
tokio.workspace = true
url.workspace = true
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
# instead, so that recompile times are better.
@@ -1,104 +0,0 @@
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
use anyhow::anyhow;
|
||||
use pageserver_api::control_api::HexTenantId;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{path::PathBuf, process::Child};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
pub struct AttachmentService {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
const COMMAND: &str = "attachment_service";
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_id: HexTenantId,
|
||||
pub pageserver_id: Option<NodeId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookResponse {
|
||||
pub gen: Option<u32>,
|
||||
}
|
||||
|
||||
impl AttachmentService {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
let path = env.base_data_dir.join("attachments.json");
|
||||
|
||||
// Makes no sense to construct this if pageservers aren't going to use it: assume
|
||||
// pageservers have control plane API set
|
||||
let listen_url = env.pageserver.control_plane_api.clone().unwrap();
|
||||
|
||||
let listen = format!(
|
||||
"{}:{}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
);
|
||||
|
||||
Self {
|
||||
env: env.clone(),
|
||||
path,
|
||||
listen,
|
||||
}
|
||||
}
|
||||
|
||||
fn pid_file(&self) -> PathBuf {
|
||||
self.env.base_data_dir.join("attachment_service.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> anyhow::Result<Child> {
|
||||
let path_str = self.path.to_string_lossy();
|
||||
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
&self.env.base_data_dir,
|
||||
&self.env.attachment_service_bin(),
|
||||
["-l", &self.listen, "-p", &path_str],
|
||||
[],
|
||||
background_process::InitialPidFile::Create(&self.pid_file()),
|
||||
// TODO: a real status check
|
||||
|| Ok(true),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())
|
||||
}
|
||||
|
||||
/// Call into the attach_hook API, for use before handing out attachments to pageservers
|
||||
pub fn attach_hook(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
pageserver_id: NodeId,
|
||||
) -> anyhow::Result<Option<u32>> {
|
||||
use hyper::StatusCode;
|
||||
|
||||
let url = self
|
||||
.env
|
||||
.pageserver
|
||||
.control_plane_api
|
||||
.clone()
|
||||
.unwrap()
|
||||
.join("attach_hook")
|
||||
.unwrap();
|
||||
let client = reqwest::blocking::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client");
|
||||
|
||||
let request = AttachHookRequest {
|
||||
tenant_id: HexTenantId::new(tenant_id),
|
||||
pageserver_id: Some(pageserver_id),
|
||||
};
|
||||
|
||||
let response = client.post(url).json(&request).send()?;
|
||||
if response.status() != StatusCode::OK {
|
||||
return Err(anyhow!("Unexpected status {0}", response.status()));
|
||||
}
|
||||
|
||||
let response = response.json::<AttachHookResponse>()?;
|
||||
Ok(response.gen)
|
||||
}
|
||||
}
|
||||
@@ -1,264 +0,0 @@
|
||||
/// The attachment service mimics the aspects of the control plane API
|
||||
/// that are required for a pageserver to operate.
|
||||
///
|
||||
/// This enables running & testing pageservers without a full-blown
|
||||
/// deployment of the Neon cloud platform.
|
||||
///
|
||||
use anyhow::anyhow;
|
||||
use clap::Parser;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response};
|
||||
use pageserver_api::control_api::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use utils::logging::{self, LogFormat};
|
||||
|
||||
use utils::{
|
||||
http::{
|
||||
endpoint::{self},
|
||||
error::ApiError,
|
||||
json::{json_request, json_response},
|
||||
RequestExt, RouterBuilder,
|
||||
},
|
||||
id::{NodeId, TenantId},
|
||||
tcp_listener,
|
||||
};
|
||||
|
||||
use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse};
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
#[command(arg_required_else_help(true))]
|
||||
struct Cli {
|
||||
#[arg(short, long)]
|
||||
listen: String,
|
||||
|
||||
#[arg(short, long)]
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
// The persistent state of each Tenant
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
struct TenantState {
|
||||
// Currently attached pageserver
|
||||
pageserver: Option<NodeId>,
|
||||
|
||||
// Latest generation number: next time we attach, increment this
|
||||
// and use the incremented number when attaching
|
||||
generation: u32,
|
||||
}
|
||||
|
||||
fn to_hex_map<S, V>(input: &HashMap<TenantId, V>, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
V: Clone + Serialize,
|
||||
{
|
||||
eprintln!("to_hex_map");
|
||||
let transformed = input
|
||||
.iter()
|
||||
.map(|(k, v)| (HexTenantId::new(k.clone()), v.clone()));
|
||||
|
||||
transformed
|
||||
.collect::<HashMap<HexTenantId, V>>()
|
||||
.serialize(serializer)
|
||||
}
|
||||
|
||||
fn from_hex_map<'de, D, V>(deserializer: D) -> Result<HashMap<TenantId, V>, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
V: Deserialize<'de>,
|
||||
{
|
||||
eprintln!("from_hex_map");
|
||||
let hex_map = HashMap::<HexTenantId, V>::deserialize(deserializer)?;
|
||||
|
||||
Ok(hex_map.into_iter().map(|(k, v)| (k.take(), v)).collect())
|
||||
}
|
||||
|
||||
// Top level state available to all HTTP handlers
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PersistentState {
|
||||
#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
|
||||
tenants: HashMap<TenantId, TenantState>,
|
||||
|
||||
#[serde(skip)]
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
impl PersistentState {
|
||||
async fn save(&self) -> anyhow::Result<()> {
|
||||
let bytes = serde_json::to_vec(self)?;
|
||||
tokio::fs::write(&self.path, &bytes).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn load(path: &Path) -> anyhow::Result<Self> {
|
||||
let bytes = tokio::fs::read(path).await?;
|
||||
let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
|
||||
decoded.path = path.to_owned();
|
||||
Ok(decoded)
|
||||
}
|
||||
|
||||
async fn load_or_new(path: &Path) -> Self {
|
||||
match Self::load(path).await {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
tracing::info!(
|
||||
"Creating new state file at {0} (load returned {e})",
|
||||
path.to_string_lossy()
|
||||
);
|
||||
Self {
|
||||
tenants: HashMap::new(),
|
||||
path: path.to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// State available to HTTP request handlers
|
||||
#[derive(Clone)]
|
||||
struct State {
|
||||
inner: Arc<tokio::sync::RwLock<PersistentState>>,
|
||||
}
|
||||
|
||||
impl State {
|
||||
fn new(persistent_state: PersistentState) -> State {
|
||||
Self {
|
||||
inner: Arc::new(tokio::sync::RwLock::new(persistent_state)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn get_state(request: &Request<Body>) -> &State {
|
||||
request
|
||||
.data::<Arc<State>>()
|
||||
.expect("unknown state type")
|
||||
.as_ref()
|
||||
}
|
||||
|
||||
/// Pageserver calls into this on startup, to learn which tenants it should attach
|
||||
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req).inner.clone();
|
||||
let mut locked = state.write().await;
|
||||
|
||||
let mut response = ReAttachResponse {
|
||||
tenants: Vec::new(),
|
||||
};
|
||||
for (t, state) in &mut locked.tenants {
|
||||
if state.pageserver == Some(reattach_req.node_id) {
|
||||
state.generation += 1;
|
||||
response.tenants.push(ReAttachResponseTenant {
|
||||
id: HexTenantId::new(t.clone()),
|
||||
generation: state.generation,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
locked
|
||||
.save()
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e))?;
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
/// Pageserver calls into this before doing deletions, to confirm that it still
|
||||
/// holds the latest generation for the tenants with deletions enqueued
|
||||
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req).inner.clone();
|
||||
let locked = state.read().await;
|
||||
|
||||
let mut response = ValidateResponse {
|
||||
tenants: Vec::new(),
|
||||
};
|
||||
|
||||
for req_tenant in validate_req.tenants {
|
||||
if let Some(tenant_state) = locked.tenants.get(req_tenant.id.as_ref()) {
|
||||
let valid = tenant_state.generation == req_tenant.gen;
|
||||
response.tenants.push(ValidateResponseTenant {
|
||||
id: req_tenant.id,
|
||||
valid,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
|
||||
/// (in the real control plane this is unnecessary, because the same program is managing
|
||||
/// generation numbers and doing attachments).
|
||||
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req).inner.clone();
|
||||
let mut locked = state.write().await;
|
||||
|
||||
let tenant_state = locked
|
||||
.tenants
|
||||
.entry(attach_req.tenant_id.take())
|
||||
.or_insert_with(|| TenantState {
|
||||
pageserver: attach_req.pageserver_id,
|
||||
generation: 0,
|
||||
});
|
||||
|
||||
if attach_req.pageserver_id.is_some() {
|
||||
tenant_state.generation += 1;
|
||||
}
|
||||
let generation = tenant_state.generation;
|
||||
|
||||
locked
|
||||
.save()
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e))?;
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
AttachHookResponse {
|
||||
gen: attach_req.pageserver_id.map(|_| generation),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
endpoint::make_router()
|
||||
.data(Arc::new(State::new(persistent_state)))
|
||||
.post("/re-attach", |r| handle_re_attach(r))
|
||||
.post("/validate", |r| handle_validate(r))
|
||||
.post("/attach_hook", |r| handle_attach_hook(r))
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
logging::init(
|
||||
LogFormat::Plain,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
)?;
|
||||
|
||||
let args = Cli::parse();
|
||||
tracing::info!(
|
||||
"Starting, state at {}, listening on {}",
|
||||
args.path.to_string_lossy(),
|
||||
args.listen
|
||||
);
|
||||
|
||||
let persistent_state = PersistentState::load_or_new(&args.path).await;
|
||||
|
||||
let http_listener = tcp_listener::bind(&args.listen)?;
|
||||
let router = make_router(persistent_state)
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
|
||||
|
||||
tracing::info!("Serving on {0}", args.listen.as_str());
|
||||
server.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -8,7 +8,6 @@
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
|
||||
use compute_api::spec::ComputeMode;
|
||||
use control_plane::attachment_service::AttachmentService;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::local_env::LocalEnv;
|
||||
use control_plane::pageserver::PageServerNode;
|
||||
@@ -44,8 +43,6 @@ project_git_version!(GIT_VERSION);
|
||||
|
||||
const DEFAULT_PG_VERSION: &str = "15";
|
||||
|
||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
|
||||
|
||||
fn default_conf() -> String {
|
||||
format!(
|
||||
r#"
|
||||
@@ -59,13 +56,11 @@ listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
|
||||
listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
|
||||
pg_auth_type = '{trust_auth}'
|
||||
http_auth_type = '{trust_auth}'
|
||||
control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
|
||||
|
||||
[[safekeepers]]
|
||||
id = {DEFAULT_SAFEKEEPER_ID}
|
||||
pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
|
||||
http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
|
||||
|
||||
"#,
|
||||
trust_auth = AuthType::Trust,
|
||||
)
|
||||
@@ -112,7 +107,6 @@ fn main() -> Result<()> {
|
||||
"start" => handle_start_all(sub_args, &env),
|
||||
"stop" => handle_stop_all(sub_args, &env),
|
||||
"pageserver" => handle_pageserver(sub_args, &env),
|
||||
"attachment_service" => handle_attachment_service(sub_args, &env),
|
||||
"safekeeper" => handle_safekeeper(sub_args, &env),
|
||||
"endpoint" => handle_endpoint(sub_args, &env),
|
||||
"pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
|
||||
@@ -348,25 +342,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
}
|
||||
}
|
||||
Some(("create", create_match)) => {
|
||||
let initial_tenant_id = parse_tenant_id(create_match)?;
|
||||
let tenant_conf: HashMap<_, _> = create_match
|
||||
.get_many::<String>("config")
|
||||
.map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
|
||||
.unwrap_or_default();
|
||||
|
||||
// If tenant ID was not specified, generate one
|
||||
let tenant_id = parse_tenant_id(create_match)?.unwrap_or(TenantId::generate());
|
||||
|
||||
let generation = if env.pageserver.control_plane_api.is_some() {
|
||||
// We must register the tenant with the attachment service, so
|
||||
// that when the pageserver restarts, it will be re-attached.
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service.attach_hook(tenant_id, env.pageserver.id)?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
pageserver.tenant_create(tenant_id, generation, tenant_conf)?;
|
||||
println!("tenant {tenant_id} successfully created on the pageserver");
|
||||
let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?;
|
||||
println!("tenant {new_tenant_id} successfully created on the pageserver");
|
||||
|
||||
// Create an initial timeline for the new tenant
|
||||
let new_timeline_id = parse_timeline_id(create_match)?;
|
||||
@@ -376,7 +358,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let timeline_info = pageserver.timeline_create(
|
||||
tenant_id,
|
||||
new_tenant_id,
|
||||
new_timeline_id,
|
||||
None,
|
||||
None,
|
||||
@@ -387,17 +369,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
|
||||
env.register_branch_mapping(
|
||||
DEFAULT_BRANCH_NAME.to_string(),
|
||||
tenant_id,
|
||||
new_tenant_id,
|
||||
new_timeline_id,
|
||||
)?;
|
||||
|
||||
println!(
|
||||
"Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
|
||||
"Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
|
||||
);
|
||||
|
||||
if create_match.get_flag("set-default") {
|
||||
println!("Setting tenant {tenant_id} as a default one");
|
||||
env.default_tenant_id = Some(tenant_id);
|
||||
println!("Setting tenant {new_tenant_id} as a default one");
|
||||
env.default_tenant_id = Some(new_tenant_id);
|
||||
}
|
||||
}
|
||||
Some(("set-default", set_default_match)) => {
|
||||
@@ -835,33 +817,6 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let svc = AttachmentService::from_env(env);
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", _start_match)) => {
|
||||
if let Err(e) = svc.start() {
|
||||
eprintln!("start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
Some(("stop", stop_match)) => {
|
||||
let immediate = stop_match
|
||||
.get_one::<String>("stop-mode")
|
||||
.map(|s| s.as_str())
|
||||
== Some("immediate");
|
||||
|
||||
if let Err(e) = svc.stop(immediate) {
|
||||
eprintln!("stop failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name),
|
||||
None => bail!("no attachment_service subcommand provided"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNode> {
|
||||
if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) {
|
||||
Ok(SafekeeperNode::from_env(env, node))
|
||||
@@ -942,16 +897,6 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
|
||||
|
||||
broker::start_broker_process(env)?;
|
||||
|
||||
// Only start the attachment service if the pageserver is configured to need it
|
||||
if env.pageserver.control_plane_api.is_some() {
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.start() {
|
||||
eprintln!("attachment_service start failed: {:#}", e);
|
||||
try_stop_all(env, true);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
let pageserver = PageServerNode::from_env(env);
|
||||
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
|
||||
eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e);
|
||||
@@ -1010,13 +955,6 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
if let Err(e) = broker::stop_broker_process(env) {
|
||||
eprintln!("neon broker stop failed: {e:#}");
|
||||
}
|
||||
|
||||
if env.pageserver.control_plane_api.is_some() {
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.stop(immediate) {
|
||||
eprintln!("attachment service stop failed: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn cli() -> Command {
|
||||
@@ -1200,14 +1138,6 @@ fn cli() -> Command {
|
||||
.arg(stop_mode_arg.clone()))
|
||||
.subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("attachment_service")
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage attachment_service")
|
||||
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
|
||||
.subcommand(Command::new("stop").about("Stop local pageserver")
|
||||
.arg(stop_mode_arg.clone()))
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("safekeeper")
|
||||
.arg_required_else_help(true)
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
// local installations.
|
||||
//
|
||||
|
||||
pub mod attachment_service;
|
||||
mod background_process;
|
||||
pub mod broker;
|
||||
pub mod endpoint;
|
||||
|
||||
@@ -118,9 +118,6 @@ pub struct PageServerConf {
|
||||
// auth type used for the PG and HTTP ports
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
|
||||
// Control plane location
|
||||
pub control_plane_api: Option<Url>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConf {
|
||||
@@ -131,7 +128,6 @@ impl Default for PageServerConf {
|
||||
listen_http_addr: String::new(),
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
control_plane_api: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -206,10 +202,6 @@ impl LocalEnv {
|
||||
self.neon_distrib_dir.join("pageserver")
|
||||
}
|
||||
|
||||
pub fn attachment_service_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("attachment_service")
|
||||
}
|
||||
|
||||
pub fn safekeeper_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("safekeeper")
|
||||
}
|
||||
|
||||
@@ -126,13 +126,6 @@ impl PageServerNode {
|
||||
broker_endpoint_param,
|
||||
];
|
||||
|
||||
if let Some(control_plane_api) = &self.env.pageserver.control_plane_api {
|
||||
overrides.push(format!(
|
||||
"control_plane_api='{}'",
|
||||
control_plane_api.as_str()
|
||||
));
|
||||
}
|
||||
|
||||
if self.env.pageserver.http_auth_type != AuthType::Trust
|
||||
|| self.env.pageserver.pg_auth_type != AuthType::Trust
|
||||
{
|
||||
@@ -323,8 +316,7 @@ impl PageServerNode {
|
||||
|
||||
pub fn tenant_create(
|
||||
&self,
|
||||
new_tenant_id: TenantId,
|
||||
generation: Option<u32>,
|
||||
new_tenant_id: Option<TenantId>,
|
||||
settings: HashMap<&str, &str>,
|
||||
) -> anyhow::Result<TenantId> {
|
||||
let mut settings = settings.clone();
|
||||
@@ -390,9 +382,11 @@ impl PageServerNode {
|
||||
.context("Failed to parse 'gc_feedback' as bool")?,
|
||||
};
|
||||
|
||||
// If tenant ID was not specified, generate one
|
||||
let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate());
|
||||
|
||||
let request = models::TenantCreateRequest {
|
||||
new_tenant_id,
|
||||
generation,
|
||||
config,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
|
||||
@@ -1,957 +0,0 @@
|
||||
# Pageserver: split-brain safety for remote storage through generation numbers
|
||||
|
||||
## Summary
|
||||
|
||||
A scheme of logical "generation numbers" for tenant attachment to pageservers is proposed, along with
|
||||
changes to the remote storage format to include these generation numbers in S3 keys.
|
||||
|
||||
Using the control plane as the issuer of these generation numbers enables strong anti-split-brain
|
||||
properties in the pageserver cluster without implementing a consensus mechanism directly
|
||||
in the pageservers.
|
||||
|
||||
## Motivation
|
||||
|
||||
Currently, the pageserver's remote storage format does not provide a mechanism for addressing
|
||||
split brain conditions that may happen when replacing a node or when migrating
|
||||
a tenant from one pageserver to another.
|
||||
|
||||
From a remote storage perspective, a split brain condition occurs whenever two nodes both think
|
||||
they have the same tenant attached, and both can write to S3. This can happen in the case of a
|
||||
network partition, pathologically long delays (e.g. suspended VM), or software bugs.
|
||||
|
||||
In the current deployment model, control plane guarantees that a tenant is attached to one
|
||||
pageserver at a time, thereby ruling out split-brain conditions resulting from dual
|
||||
attachment (however, there is always the risk of a control plane bug). This control
|
||||
plane guarantee prevents a robust response to failures, because if a pageserver is unresponsive
|
||||
we may not detach from it. The mechanism in this RFC fixes this, by making it safe to
|
||||
attach to a new, different pageserver even if an unresponsive pageserver may be running.
|
||||
|
||||
Further, the lack of safety during split-brain conditions blocks two important features where occasional
|
||||
split-brain conditions are part of the design assumptions:
|
||||
|
||||
- seamless tenant migration ([RFC PR](https://github.com/neondatabase/neon/pull/5029))
|
||||
- automatic pageserver instance failure handling (aka "failover") (RFC TBD)
|
||||
|
||||
### Prior art
|
||||
|
||||
- 020-pageserver-s3-coordination.md
|
||||
- 023-the-state-of-pageserver-tenant-relocation.md
|
||||
- 026-pageserver-s3-mvcc.md
|
||||
|
||||
This RFC has broad similarities to the proposal to implement a MVCC scheme in
|
||||
S3 object names, but this RFC avoids a general purpose transaction scheme in
|
||||
favour of more specialized "generations" that work like a transaction ID that
|
||||
always has the same lifetime as a pageserver process or tenant attachment, whichever
|
||||
is shorter.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Accommodate storage backends with no atomic or fencing capability (i.e. work within
|
||||
S3's limitation that there are no atomics and clients can't be fenced)
|
||||
- Don't depend on any STONITH or node fencing in the compute layer (i.e. we will not
|
||||
assume that we can reliably kill an EC2 instance and have it die)
|
||||
- Scoped per-tenant, not per-pageserver; for _seamless tenant migration_, we need
|
||||
per-tenant granularity, and for _failover_, we likely want to spread the workload
|
||||
of the failed pageserver instance to a number of peers, rather than monolithically
|
||||
moving the entire workload to another machine.
|
||||
We do not rule out the latter case, but should not constrain ourselves to it.
|
||||
|
||||
## Design Tenets
|
||||
|
||||
These are not requirements, but are ideas that guide the following design:
|
||||
|
||||
- Avoid implementing another consensus system: we already have a strongly consistent
|
||||
database in the control plane that can do atomic operations where needed, and we also
|
||||
have a Paxos implementation in the safekeeper.
|
||||
- Avoid locking in to specific models of how failover will work (e.g. do not assume that
|
||||
all the tenants on a pageserver will fail over as a unit).
|
||||
- Be strictly correct when it comes to data integrity. Occasional failures of availability
|
||||
are tolerable, occasional data loss is not.
|
||||
|
||||
## Non Goals
|
||||
|
||||
The changes in this RFC intentionally isolate the design decision of how to define
|
||||
logical generation numbers and object storage format in a way that is somewhat flexible with
|
||||
respect to how actual orchestration of failover works.
|
||||
|
||||
This RFC intentionally does not cover:
|
||||
|
||||
- Failure detection
|
||||
- Orchestration of failover
|
||||
- Standby modes to keep data ready for fast migration
|
||||
- Intentional multi-writer operation on tenants (multi-writer scenarios are assumed to be transient split-brain situations).
|
||||
- Sharding.
|
||||
|
||||
The interaction between this RFC and those features is discussed in [Appendix B](#appendix-b-interoperability-with-other-features)
|
||||
|
||||
## Impacted Components
|
||||
|
||||
pageserver, control plane, safekeeper (optional)
|
||||
|
||||
## Implementation Part 1: Correctness
|
||||
|
||||
### Summary
|
||||
|
||||
- A per-tenant **generation number** is introduced to uniquely identify tenant attachments to pageserver processes.
|
||||
|
||||
- This generation number increments each time the control plane modifies a tenant (`Project`)'s assigned pageserver, or when the assigned pageserver restarts.
|
||||
- the control plane is the authority for generation numbers: only it may
|
||||
increment a generation number.
|
||||
|
||||
- **Object keys are suffixed** with the generation number
|
||||
- **Safety for multiply-attached tenants** is provided by the
|
||||
generation number in the object key: the competing pageservers will not
|
||||
try to write to the same keys.
|
||||
- **Safety in split brain for multiple nodes running with
|
||||
the same node ID** is provided by the pageserver calling out to the control plane
|
||||
on startup, to re-attach and thereby increment the generations of any attached tenants
|
||||
- **Safety for deletions** is achieved by deferring the DELETE from S3 to a point in time where the deleting node has validated with control plane that no attachment with a higher generation has a reference to the to-be-DELETEd key.
|
||||
- **The control plane is used to issue generation numbers** to avoid the need for
|
||||
a built-in consensus system in the pageserver, although this could in principle
|
||||
be changed without changing the storage format.
|
||||
|
||||
### Generation numbers
|
||||
|
||||
A generation number is associated with each tenant in the control plane,
|
||||
and each time the attachment status of the tenant changes, this is incremented.
|
||||
Changes in attachment status include:
|
||||
|
||||
- Attaching the tenant to a different pageserver
|
||||
- A pageserver restarting, and "re-attaching" its tenants on startup
|
||||
|
||||
These increments of attachment generation provide invariants we need to avoid
|
||||
split-brain issues in storage:
|
||||
|
||||
- If two pageservers have the same tenant attached, the attachments are guaranteed to have different generation numbers, because the generation would increment
|
||||
while attaching the second one.
|
||||
- If there are multiple pageservers running with the same node ID, all the attachments on all pageservers are guaranteed to have different generation numbers, because the generation would increment
|
||||
when the second node started and re-attached its tenants.
|
||||
|
||||
As long as the infrastructure does not transparently replace an underlying
|
||||
physical machine, we are totally safe. See the later [unsafe case](#unsafe-case-on-badly-behaved-infrastructure) section for details.
|
||||
|
||||
### Object Key Changes
|
||||
|
||||
#### Generation suffix
|
||||
|
||||
All object keys (layer objects and index objects) will contain the attachment
|
||||
generation as a [suffix](#why-a-generation-suffix-rather-than-prefix).
|
||||
This suffix is the primary mechanism for protecting against split-brain situations, and
|
||||
enabling safe multi-attachment of tenants:
|
||||
|
||||
- Two pageservers running with the same node ID (e.g. after a failure, where there is
|
||||
some rogue pageserver still running) will not try to write to the same objects, because at startup they will have re-attached tenants and thereby incremented
|
||||
generation numbers.
|
||||
- Multiple attachments (to different pageservers) of the same tenant will not try to write to the same objects, as each attachment would have a distinct generation.
|
||||
|
||||
The generation is appended in hex format (8 byte string representing
|
||||
u32), to all our existing key names. A u32's range limit would permit
|
||||
27 restarts _per second_ over a 5 year system lifetime: orders of magnitude more than
|
||||
is realistic.
|
||||
|
||||
The exact meaning of the generation suffix can evolve over time if necessary, for
|
||||
example if we chose to implement a failover mechanism internally to the pageservers
|
||||
rather than going via the control plane. The storage format just sees it as a number,
|
||||
with the only semantic property being that the highest numbered index is the latest.
|
||||
|
||||
#### Index changes
|
||||
|
||||
Since object keys now include a generation suffix, the index of these keys must also be updated. IndexPart currently stores keys and LSNs sufficient to reconstruct key names: this would be extended to store the generation as well.
|
||||
|
||||
This will increase the size of the file, but only modestly: layers are already encoded as
|
||||
their string-ized form, so the overhead is about 10 bytes per layer. This will be less if/when
|
||||
the index storage format is migrated to a binary format from JSON.
|
||||
|
||||
#### Visibility
|
||||
|
||||
_This section doesn't describe code changes, but extends on the consequences of the
|
||||
object key changes given above_
|
||||
|
||||
##### Visibility of objects to pageservers
|
||||
|
||||
Pageservers can of course list objects in S3 at any time, but in practice their
|
||||
visible set is based on the contents of their LayerMap, which is initialized
|
||||
from the `index_part.json.???` that they load.
|
||||
|
||||
Starting with the `index_part` from the most recent previous generation
|
||||
(see [loading index_part](#finding-the-remote-indices-for-timelines)), a pageserver
|
||||
initially has visibility of all the objects that were referenced in the loaded index.
|
||||
These objects are guaranteed to remain visible until the current generation is
|
||||
superseded, via pageservers in older generations avoiding deletions (see [deletion](#deletion)).
|
||||
|
||||
The "most recent previous generation" is _not_ necessarily the most recent
|
||||
in terms of walltime, it is the one that is readable at the time a new generation
|
||||
starts. Consider the following sequence of a tenant being re-attached to different
|
||||
pageserver nodes:
|
||||
|
||||
- Create + attach on PS1 in generation 1
|
||||
- PS1 Do some work, write out index_part.json-0001
|
||||
- Attach to PS2 in generation 2
|
||||
- Read index_part.json-0001
|
||||
- PS2 starts doing some work...
|
||||
- Attach to PS3 in generation 3
|
||||
- Read index_part.json-0001
|
||||
- **...PS2 finishes its work: now it writes index_part.json-0002**
|
||||
- PS3 writes out index_part.json-0003
|
||||
|
||||
In the above sequence, the ancestry of indices is:
|
||||
|
||||
```
|
||||
0001 -> 0002
|
||||
|
|
||||
-> 0003
|
||||
```
|
||||
|
||||
This is not an issue for safety: if the 0002 references some object that is
|
||||
not in 0001, then 0003 simply does not see it, and will re-do whatever
|
||||
work was required (e.g. ingesting WAL or doing compaction). Objects referenced
|
||||
by only the 0002 index will never be read by future attachment generations, and
|
||||
will eventually be cleaned up by a scrub (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)).
|
||||
|
||||
##### Visibility of LSNs to clients
|
||||
|
||||
Because index_part.json is now written with a generation suffix, which data
|
||||
is visible depends on which generation the reader is operating in:
|
||||
|
||||
- If one was passively reading from S3 from outside of a pageserver, the
|
||||
visibility of data would depend on which index_part.json-<generation> file
|
||||
one had chosen to read from.
|
||||
- If two pageservers have the same tenant attached, they may have different
|
||||
data visible as they're independently replaying the WAL, and maintaining
|
||||
independent LayerMaps that are written to independent index_part.json files.
|
||||
Data does not have to be remotely committed to be visible.
|
||||
- For a pageserver writing with a stale generation, historic LSNs
|
||||
remain readable until another pageserver (with a higher generation suffix)
|
||||
decides to execute GC deletions. At this point, we may think of the stale
|
||||
attachment's generation as having logically ended: during its existence
|
||||
the generation had a consistent view of the world.
|
||||
- For a newly attached pageserver, its highest visible LSN may appear to
|
||||
go backwards with respect to an earlier attachment, if that earlier
|
||||
attachment had not uploaded all data to S3 before the new attachment.
|
||||
|
||||
### Deletion
|
||||
|
||||
#### Generation number validation
|
||||
|
||||
While writes are de-conflicted by writers always using their own generation number in the key,
|
||||
deletions are slightly more challenging: if a pageserver A is isolated, and the true active node is
|
||||
pageserver B, then it is dangerous for A to do any object deletions, even of objects that it wrote
|
||||
itself, because pageserver's B metadata might reference those objects.
|
||||
|
||||
We solve this by inserting a "generation validation" step between the write of a remote index
|
||||
that un-links a particular object from the index, and the actual deletion of the object, such
|
||||
that deletions strictly obey the following ordering:
|
||||
|
||||
1. Write out index_part.json: this guarantees that any subsequent reader of the metadata will
|
||||
not try and read the object we unlinked.
|
||||
2. Call out to control plane to validate that the generation which we use for our attachment is still the latest.
|
||||
3. If step 2 passes, it is safe to delete the object. Why? The check-in with control plane
|
||||
together with our visibility rules guarantees that any later generation
|
||||
will use either the exact `index_part.json` that we uploaded in step 1, or a successor
|
||||
of it; not an earlier one. In both cases, the `index_part.json` doesn't reference the
|
||||
key we are deleting anymore, so, the key is invisible to any later attachment generation.
|
||||
Hence it's safe to delete it.
|
||||
|
||||
Note that at step 2 we are only confirming that deletions of objects _no longer referenced
|
||||
by the specific `index_part.json` written in step 1_ are safe. If we were attempting other deletions concurrently,
|
||||
these would need their own generation validation step.
|
||||
|
||||
If step 2 fails, we may leak the object. This is safe, but has a cost: see [scrubbing](#cleaning-up-orphan-objects-scrubbing). We may avoid this entirely outside of node
|
||||
failures, if we do proper flushing of deletions on clean shutdown and clean migration.
|
||||
|
||||
To avoid doing a huge number of control plane requests to perform generation validation,
|
||||
validation of many tenants will be done in a single request, and deletions will be queued up
|
||||
prior to validation: see [Persistent deletion queue](#persistent-deletion-queue) for more.
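
The ordering can be sketched as follows; `RemoteStorage`, `ControlPlaneClient` and the function names are hypothetical stand-ins used only to illustrate the rule, not the pageserver's actual types:

```rust
/// Hypothetical stand-ins used only to illustrate the deletion ordering.
trait RemoteStorage {
    fn upload_index_part(&self, serialized_index: &[u8], generation: u32) -> anyhow::Result<()>;
    fn delete_objects(&self, keys: &[String]) -> anyhow::Result<()>;
}

trait ControlPlaneClient {
    /// True if `generation` is still the latest attachment generation for `tenant_id`.
    fn validate_generation(&self, tenant_id: &str, generation: u32) -> anyhow::Result<bool>;
}

fn delete_unlinked_objects(
    storage: &dyn RemoteStorage,
    control_plane: &dyn ControlPlaneClient,
    tenant_id: &str,
    generation: u32,
    serialized_index: &[u8],
    unlinked_keys: &[String],
) -> anyhow::Result<()> {
    // 1. Persist the index that no longer references `unlinked_keys`.
    storage.upload_index_part(serialized_index, generation)?;
    // 2. Confirm with the control plane that our generation is still current.
    if control_plane.validate_generation(tenant_id, generation)? {
        // 3. Only now is it safe to delete the unreferenced objects.
        storage.delete_objects(unlinked_keys)?;
    }
    // If validation failed we simply skip the deletion: the objects may be
    // leaked, and the scrubber reclaims them later.
    Ok(())
}
```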
|
||||
|
||||
#### `remote_consistent_lsn` updates
|
||||
|
||||
Remote objects are not the only kind of deletion the pageserver does: it also indirectly deletes
|
||||
WAL data, by feeding back remote_consistent_lsn to safekeepers, as a signal to the safekeepers that
|
||||
they may drop data below this LSN.
|
||||
|
||||
For the same reasons that deletion of objects must be guarded by an attachment generation number
|
||||
validation step, updates to `remote_consistent_lsn` are subject to the same rules, using
|
||||
an ordering as follows:
|
||||
|
||||
1. upload the index_part that covers data up to LSN `L0` to S3
|
||||
2. Call out to control plane to validate that the generation which we use for our attachment is still the latest.
|
||||
3. advance the `remote_consistent_lsn` that we advertise to the safekeepers to `L0`
|
||||
|
||||
If step 2 fails, then the `remote_consistent_lsn` advertised
|
||||
to safekeepers will not advance again until a pageserver
|
||||
with the latest generation is ready to do so.
|
||||
|
||||
**Note:** at step 3 we are not advertising the _latest_ remote_consistent_lsn, we are
|
||||
advertising the value in the index_part that we uploaded in step 1. This provides
|
||||
a strong ordering guarantee.
|
||||
|
||||
Internally to the pageserver, each timeline will have two remote_consistent_lsn values: the one that
|
||||
reflects its latest write to remote storage, and the one that reflects the most
|
||||
recent validation of generation number. It is only the latter value that may
|
||||
be advertised to the outside world (i.e. to the safekeeper).
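
Sketched as a data structure (the field names are illustrative, not the actual `Timeline` fields):

```rust
/// Illustrative only; `Lsn` stands in for the pageserver's LSN type.
type Lsn = u64;

struct RemoteConsistentLsnState {
    /// Highest LSN covered by an index_part that we have uploaded to remote storage.
    uploaded: Lsn,
    /// Highest uploaded LSN whose attachment generation has since been validated
    /// with the control plane; only this value is advertised to the safekeepers.
    validated: Lsn,
}
```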
|
||||
|
||||
The control plane remains unaware of `remote_consistent_lsn`: it only has to validate
|
||||
the freshness of generation numbers, thereby granting the pageserver permission to
|
||||
share the information with the safekeeper.
|
||||
|
||||
For convenience, in subsequent sections and RFCs we will use "deletion" to mean both deletion
|
||||
of objects in S3, and updates to the `remote_consistent_lsn`, as updates to the remote consistent
|
||||
LSN are de-facto deletions done via the safekeeper, and both kinds of deletion are subject to
|
||||
the same generation validation requirement.
|
||||
|
||||
### Pageserver attach/startup changes
|
||||
|
||||
#### Attachment
|
||||
|
||||
Calls to `/v1/tenant/{tenant_id}/attach` are augmented with an additional
|
||||
`generation` field in the body.
|
||||
|
||||
The pageserver does not persist this: a generation is only good for the lifetime
|
||||
of a process.
|
||||
|
||||
#### Finding the remote indices for timelines
|
||||
|
||||
Because index files are now suffixed with generation numbers, the pageserver
|
||||
cannot always GET the remote index in one request, because it can't always
|
||||
know a-priori what the latest remote index is.
|
||||
|
||||
Typically, the most recent generation to write an index would be our own
|
||||
generation minus 1. However, this might not be the case: the previous
|
||||
node might have started and acquired a generation number, and then crashed
|
||||
before writing out a remote index.
|
||||
|
||||
In the general case and as a fallback, the pageserver may list all the `index_part.json`
|
||||
files for a timeline, sort them by generation, and pick the highest that is `<=`
|
||||
its current generation for this attachment. The tenant should never load an index
|
||||
with an attachment generation _newer_ than its own.
|
||||
These two rules combined ensure that objects written by later generations are never visible to earlier generations.
|
||||
|
||||
Note that if a given attachment picks an index part from an earlier generation (say n-2), but crashes & restarts before it writes its own generation's index part, next time it tries to pick an index part there may be an index part from generation n-1.
It would pick the n-1 index part in that case, because it sorts higher than the previous one from generation n-2.
So the above rules do not guarantee a deterministic choice of index part across restarts.
Note also that tenants are allowed to be attached with stale attachment generations during a multiply-attached
phase in a migration, and in this instance if the old location's pageserver restarts,
it should not try to load the newer generation's index.
|
||||
|
||||
To summarize, on starting a timeline, the pageserver will:
|
||||
|
||||
1. Issue a GET for index_part.json-<my generation - 1>
|
||||
2. If 1 failed, issue a ListObjectsv2 request for index_part.json\* and
pick the newest whose generation is `<=` our own (sketched below).
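
A sketch of that selection rule, assuming the listed `index_part.json-<gen>` names have already been parsed into generation numbers (the helper is hypothetical):

```rust
/// Illustrative only: pick the newest index generation that is not newer than
/// this attachment's own generation.
fn select_index_generation(listed_generations: &[u32], my_generation: u32) -> Option<u32> {
    listed_generations
        .iter()
        .copied()
        .filter(|g| *g <= my_generation)
        .max()
}
```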
|
||||
|
||||
One could optimize this further by using the control plane to record specifically
|
||||
which generation most recently wrote an index_part.json, if necessary, to increase
|
||||
the probability of finding the index_part.json in one GET. One could also improve
|
||||
the chances by having pageservers proactively write out index_part.json after they
|
||||
get a new generation ID.
|
||||
|
||||
#### Re-attachment on startup
|
||||
|
||||
On startup, the pageserver will call out to a new control plane `/re-attach`
|
||||
API (see [Generation API](#generation-api)). This returns a list of
|
||||
tenants that should be attached to the pageserver, and their generation numbers, which
|
||||
the control plane will increment before returning.
|
||||
|
||||
The pageserver should still scan its local disk on startup, but should _delete_
|
||||
any local content for tenants not indicated in the `/re-attach` response: their
|
||||
absence is an implicit detach operation.
|
||||
|
||||
**Note** if a tenant is omitted from the re-attach response, its local disk content
|
||||
will be deleted. This will change in subsequent work, when the control plane gains
|
||||
the concept of a secondary/standby location: a node with local content may revert
|
||||
to this status and retain some local content.
|
||||
|
||||
#### Cleaning up previous generations' remote indices
|
||||
|
||||
Deletion of old indices is not necessary for correctness, although it is necessary
|
||||
to avoid the ListObjects fallback in the previous section becoming ever more expensive.
|
||||
|
||||
Once the new attachment has written out its index_part.json, it may asynchronously clean up historic index_part.json
|
||||
objects that were found.
|
||||
|
||||
We may choose to implement this deletion either as an explicit step after we
|
||||
write out index_part for the first time in a pageserver's lifetime, or for
|
||||
simplicity just do it periodically as part of the background scrub (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)).
|
||||
|
||||
### Control Plane Changes
|
||||
|
||||
#### Store generations for attaching tenants
|
||||
|
||||
- The `Project` table must store the generation number for use when
|
||||
attaching the tenant to a new pageserver.
|
||||
- The `/v1/tenant/:tenant_id/attach` pageserver API will require the generation number,
|
||||
which the control plane can supply by simply incrementing the `Project`'s
|
||||
generation number each time the tenant is attached to a different server: the same database
|
||||
transaction that changes the assigned pageserver should also change the generation number.
|
||||
|
||||
#### Generation API
|
||||
|
||||
This section describes an API that could be provided directly by the control plane,
|
||||
or built as a separate microservice. In earlier parts of the RFC, when we
|
||||
discuss the control plane providing generation numbers, we are referring to this API.
|
||||
|
||||
The API endpoints used by the pageserver to acquire and validate generation
|
||||
numbers are quite simple, and only require access to some persistent and
|
||||
linearizable storage (such as a database).
|
||||
|
||||
Building this into the control plane is proposed as a least-effort option to exploit existing infrastructure and implement generation number issuance in the same transaction that mandates it (i.e., the transaction that updates the `Project` assignment to another pageserver).
|
||||
However, this is not mandatory: this "Generation Number Issuer" could
|
||||
be built as a microservice. In practice, we will write such a miniature service
|
||||
anyway, to enable E2E pageserver/compute testing without control plane.
|
||||
|
||||
The endpoints required by pageservers are:
|
||||
|
||||
##### `/re-attach`
|
||||
|
||||
- Request: `{node_id: <u32>}`
|
||||
- Response:
|
||||
- 200 `{tenants: [{id: <TenantId>, gen: <u32>}]}`
|
||||
- 404: unknown node_id
|
||||
- (Future: 429: flapping detected, perhaps nodes are fighting for the same node ID,
|
||||
or perhaps this node was in a retry loop)
|
||||
- (On unknown tenants, omit tenant from `tenants` array)
|
||||
- Server behavior: query database for which tenants should be attached to this pageserver.
|
||||
- for each tenant that should be attached, increment the attachment generation and
|
||||
include the new generation in the response
|
||||
- Client behavior:
|
||||
- for all tenants in the response, activate with the new generation number
|
||||
- for any local disk content _not_ referenced in the response, act as if we
|
||||
had been asked to detach it (i.e. delete local files)
|
||||
|
||||
**Note** the `node_id` in this request will change in future if we move to ephemeral
|
||||
node IDs, to be replaced with some correlation ID that helps the control plane realize
|
||||
if a process is running with the same storage as a previous pageserver process (e.g.
|
||||
we might use EC instance ID, or we might just write some UUID to the disk the first
|
||||
time we use it)
|
||||
|
||||
##### `/validate`
|
||||
|
||||
- Request: `{'tenants': [{tenant: <tenant id>, attach_gen: <gen>}, ...]}'`
|
||||
- Response:
|
||||
- 200 `{'tenants': [{tenant: <tenant id>, status: <bool>}...]}`
|
||||
- (On unknown tenants, omit tenant from `tenants` array)
|
||||
- Purpose: enable the pageserver to discover for the given attachments whether they are still the latest.
|
||||
- Server behavior: this is a read-only operation: simply compare the generations in the request with
|
||||
the generations known to the server, and set status to `true` if they match.
|
||||
- Client behavior: clients must not do deletions within a tenant's remote data until they have
|
||||
received a response indicating the generation they hold for the attachment is current.
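
For concreteness, the request/response bodies above could be modelled with serde types along these lines (illustrative; the real control plane / attachment service types may differ):

```rust
use serde::{Deserialize, Serialize};

// Illustrative sketch of the Generation API bodies; field names follow the
// descriptions above, but the real types may differ.

#[derive(Serialize, Deserialize)]
struct ReAttachRequest {
    node_id: u32,
}

#[derive(Serialize, Deserialize)]
struct ReAttachResponseTenant {
    id: String, // TenantId in the real code
    gen: u32,
}

#[derive(Serialize, Deserialize)]
struct ReAttachResponse {
    tenants: Vec<ReAttachResponseTenant>,
}

#[derive(Serialize, Deserialize)]
struct ValidateRequestTenant {
    tenant: String, // TenantId in the real code
    attach_gen: u32,
}

#[derive(Serialize, Deserialize)]
struct ValidateRequest {
    tenants: Vec<ValidateRequestTenant>,
}

#[derive(Serialize, Deserialize)]
struct ValidateResponseTenant {
    tenant: String,
    status: bool,
}

#[derive(Serialize, Deserialize)]
struct ValidateResponse {
    tenants: Vec<ValidateResponseTenant>,
}
```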
|
||||
|
||||
#### Use of `/load` and `/ignore` APIs
|
||||
|
||||
Because the pageserver will be changed to only attach tenants on startup
|
||||
based on the control plane's response to a `/re-attach` request, the load/ignore
|
||||
APIs no longer make sense in their current form.
|
||||
|
||||
The `/load` API becomes functionally equivalent to attach, and will be removed:
|
||||
any location that used `/load` before should just attach instead.
|
||||
|
||||
The `/ignore` API is equivalent to detaching, but without deleting local files.
|
||||
|
||||
### Timeline/Branch creation & deletion
|
||||
|
||||
All of the previous arguments for safety have described operations within
|
||||
a timeline, where we may describe a sequence that includes updates to
|
||||
index_part.json, and where reads and writes are coming from a postgres
|
||||
endpoint (writes via the safekeeper).
|
||||
|
||||
Creating or destroying timeline is a bit different, because writes
|
||||
are coming from the control plane.
|
||||
|
||||
We must be safe against scenarios such as:
|
||||
|
||||
- A tenant is attached to pageserver B while pageserver A is
|
||||
in the middle of servicing an RPC from the control plane to
|
||||
create or delete a tenant.
|
||||
- A pageserver A has been sent a timeline creation request
|
||||
but becomes unresponsive. The tenant is attached to a
|
||||
different pageserver B, and the timeline creation request
|
||||
is sent there too.
|
||||
|
||||
#### Timeline Creation
|
||||
|
||||
If some very slow node tries to do a timeline creation _after_
|
||||
a more recent generation node has already created the timeline
|
||||
and written some data into it, that must not cause harm. This
|
||||
is provided in timeline creations by the way all the objects
|
||||
within the timeline's remote path include a generation suffix:
|
||||
a slow node in an old generation that attempts to "create" a timeline
|
||||
that already exists will just emit an index_part.json with
|
||||
an old generation suffix.
|
||||
|
||||
Timeline IDs are never reused, so we don't have
|
||||
to worry about the case of create/delete/create cycles. If they
|
||||
were re-used during a disaster recovery "un-delete" of a timeline,
|
||||
that special case can be handled by calling out to all available pageservers
|
||||
to check that they return 404 for the timeline, and to flush their
|
||||
deletion queues in case they had any deletions pending from the
|
||||
timeline.
|
||||
|
||||
The above makes it safe for control plane to change the assignment of
|
||||
tenant to pageserver in control plane while a timeline creation is ongoing.
|
||||
The reason is that the creation request against the new assigned pageserver
|
||||
uses a new generation number. However, care must be taken by control plane
|
||||
to ensure that a "timeline creation successul" response from some pageserver
|
||||
is checked for the pageserver's generation for that timeline's tenant still being the latest.
|
||||
If it is not the latest, the response does not constitute a successful timeline creation.
|
||||
It is acceptable to discard such responses, the scrubber will clean up the S3 state.
|
||||
It is better to issue a timelien deletion request to the stale attachment.
|
||||
|
||||
#### Timeline Deletion
|
||||
|
||||
Tenant/timeline deletion operations are exempt from generation validation
|
||||
on deletes, and therefore don't have to go through the same deletion
|
||||
queue as GC/compaction layer deletions. This is because once a
|
||||
delete is issued by the control plane, it is a promise that the
|
||||
control plane will keep trying until the deletion is done, so even stale
|
||||
pageservers are permitted to go ahead and delete the objects.
|
||||
|
||||
The implications of this for control plane are:
|
||||
|
||||
- During timeline/tenant deletion, the control plane must wait for the deletion to
|
||||
be truly complete (status 404) and also handle the case where the pageserver
|
||||
becomes unavailable, either by waiting for a replacement with the same node_id,
|
||||
or by re-attaching the tenant elsewhere.
|
||||
|
||||
- The control plane must persist its intent to delete
|
||||
a timeline/tenant before issuing any RPCs, and then once it starts, it must
|
||||
keep retrying until the tenant/timeline is gone. This is already handled
|
||||
by using a persistent `Operation` record that is retried indefinitely.
|
||||
|
||||
Timeline deletion may result in a special kind of object leak, where
|
||||
the latest generation attachment completes a deletion (including erasing
|
||||
all objects in the timeline path), but some slow/partitioned node is
|
||||
writing into the timeline path with a stale generation number. This would
|
||||
not be caught by any per-timeline scrubbing (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)), since scrubbing happens on the
|
||||
attached pageserver, and once the timeline is deleted it isn't attached anywhere.
|
||||
This scenario should be pretty rare, and the control plane can make it even
|
||||
rarer by ensuring that if a tenant is in a multi-attached state (e.g. during
|
||||
migration), we wait for that to complete before processing the deletion. Beyond
|
||||
that, we may implement some other top-level scrub of timelines in
|
||||
an external tool, to identify any tenant/timeline paths that are not found
|
||||
in the control plane database.
|
||||
|
||||
#### Examples
|
||||
|
||||
- Deletion, node restarts partway through:
|
||||
- By the time we returned 202, we have written a remote delete marker
|
||||
- Any subsequent incarnation of the same node_id will see the remote
|
||||
delete marker and continue to process the deletion
|
||||
- If the original pageserver is lost permanently and no replacement
|
||||
with the same node_id is available, then the control plane must recover
|
||||
by re-attaching the tenant to a different node.
|
||||
- Creation, node becomes unresponsive partway through.
|
||||
- Control plane will see HTTP request timeout, keep re-issuing
|
||||
request to whoever is the latest attachment point for the tenant
|
||||
until it succeeds.
|
||||
- Stale nodes may be trying to execute timeline creation: they will
|
||||
write out index_part.json files with
|
||||
stale attachment generation: these will be eventually cleaned up
|
||||
by the same mechanism as other old indices.
|
||||
|
||||
### Unsafe case on badly behaved infrastructure
|
||||
|
||||
This section is only relevant if running on a different environment
|
||||
than EC2 machines with ephemeral disks.
|
||||
|
||||
If we ever run pageservers on infrastructure that might transparently restart
|
||||
a pageserver while leaving an old process running (e.g. a VM gets rescheduled
|
||||
without the old one being fenced), then there is a risk of corruption, when
|
||||
the control plane attaches the tenant, as follows:
|
||||
|
||||
- If the control plane sends an `/attach` request to node A, then node A dies
|
||||
and is replaced, and the control plane's retries the request without
|
||||
incrementing that attachment ID, then it could end up with two physical nodes
|
||||
both using the same generation number.
|
||||
- This is not an issue when using EC2 instances with ephemeral storage, as long
|
||||
as the control plane never re-uses a node ID, but it would need re-examining
|
||||
if running on different infrastructure.
|
||||
- To robustly protect against this class of issue, we would either:
|
||||
- add a "node generation" to distinguish between different processes holding the
|
||||
same node_id.
|
||||
- or, dispense with static node_id entirely and issue an ephemeral ID to each
|
||||
pageserver process when it starts.
|
||||
|
||||
## Implementation Part 2: Optimizations
|
||||
|
||||
### Persistent deletion queue
|
||||
|
||||
Between writing out a new index_part.json that doesn't reference an object,
|
||||
and executing the deletion, an object passes through a window where it is
|
||||
only referenced in memory, and could be leaked if the pageserver is stopped
|
||||
uncleanly. That introduces conflicting incentives: on the one hand, we would
|
||||
like to delay and batch deletions to
|
||||
1. minimize the cost of the mandatory validations calls to control plane, and
|
||||
2. minimize cost for DeleteObjects requests.
|
||||
On the other hand we would also like to minimize leakage by executing
|
||||
deletions promptly.
|
||||
|
||||
To resolve this, we may make the deletion queue persistent
|
||||
and then execute these deletions in the background at a later time.
|
||||
|
||||
_Note: The deletion queue's reason for existence is optimization rather than correctness,
|
||||
so there is a lot of flexibility in exactly how it should work,
|
||||
as long as it obeys the rule to validate generations before executing deletions,
|
||||
so the following details are not essential to the overall RFC._
|
||||
|
||||
#### Scope
|
||||
|
||||
The deletion queue will be global per pageserver, not per-tenant. There
|
||||
are several reasons for this choice:
|
||||
|
||||
- Use the queue as a central point to coalesce validation requests to the
|
||||
control plane: this avoids individual `Timeline` objects ever touching
|
||||
the control plane API, and avoids them having to know the rules about
|
||||
validating deletions. This separation of concerns will avoid burdening
|
||||
the already many-LoC `Timeline` type with even more responsibility.
|
||||
- Decouple the deletion queue from Tenant attachment lifetime: we may
|
||||
"hibernate" an inactive tenant by tearing down its `Tenant`/`Timeline`
|
||||
objects in the pageserver, without having to wait for deletions to be done.
|
||||
- Amortize the cost of I/O for the persistent queue, instead of having many
|
||||
tiny queues.
|
||||
- Coalesce deletions into a smaller number of larger DeleteObjects calls
|
||||
|
||||
Because of the cost of doing I/O for persistence, and the desire to coalesce
|
||||
generation validation requests across tenants, and coalesce deletions into
|
||||
larger DeleteObjects requests, there will be one deletion queue per pageserver
|
||||
rather than one per tenant. This has the added benefit that when deactivating
|
||||
a tenant, we do not have to drain its deletion queue: deletions can proceed
|
||||
for a tenant whose main `Tenant` object has been torn down.
|
||||
|
||||
#### Flow of deletion
|
||||
|
||||
The flow of a deletion becomes:
|
||||
|
||||
1. Need for deletion of an object (=> layer file) is identified.
|
||||
2. Unlink the object from all the places that reference it (=> `index_part.json`).
|
||||
3. Enqueue the deletion to a persistent queue.
|
||||
Each entry is `tenant_id, attachment_generation, S3 key`.
|
||||
4. Validate & execute in batches:
|
||||
4.1 For a batch of entries, call into control plane.
|
||||
4.2 For the subset of entries that passed validation, execute a `DeleteObjects` S3 DELETE request for their S3 keys.
|
||||
|
||||
As outlined in the Part 1 on correctness, it is critical that deletions are only
|
||||
executed once the key is not referenced anywhere in S3.
|
||||
This property is obviously upheld by the scheme above.
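
A sketch of the queue entry and of step 4's batching (illustrative; the validation and S3 calls are stand-ins, not the actual pageserver or control plane APIs):

```rust
/// Illustrative only: one enqueued deletion, as produced by step 3.
struct DeletionQueueEntry {
    tenant_id: String,          // TenantId in the real code
    attachment_generation: u32, // generation the deleting attachment holds
    key: String,                // S3 key of the now-unreferenced object
}

/// Step 4, sketched: validate a batch with the control plane, then issue a single
/// DeleteObjects request for the entries whose generations are still current.
fn process_batch(
    batch: Vec<DeletionQueueEntry>,
    validate: impl Fn(&[(String, u32)]) -> Vec<bool>, // 4.1: control plane call
    delete_objects: impl Fn(&[String]),                // 4.2: S3 DeleteObjects call
) {
    let to_validate: Vec<(String, u32)> = batch
        .iter()
        .map(|e| (e.tenant_id.clone(), e.attachment_generation))
        .collect();
    let validity = validate(&to_validate);
    let keys: Vec<String> = batch
        .into_iter()
        .zip(validity)
        .filter_map(|(entry, still_current)| still_current.then_some(entry.key))
        .collect();
    if !keys.is_empty() {
        delete_objects(&keys);
    }
}
```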
|
||||
|
||||
#### We Accept Object Leakage In Acceptable Circumstances
|
||||
|
||||
If we crash in the flow above between (2) and (3), we lose track of the unreferenced object.
Further, enqueuing a single entry to the persistent queue may not be immediately durable, because flushes to disk are amortized.
This is acceptable for now: leaked objects can be caught by [the scrubber](#cleaning-up-orphan-objects-scrubbing).
|
||||
|
||||
There are various measures we can take to improve this in the future.
|
||||
1. Cap the amount of time until an enqueued entry becomes durable (a timeout for flush-to-disk)
|
||||
2. Proactively flush:
|
||||
- On graceful shutdown, as we anticipate that some or
|
||||
all of our attachments may be re-assigned while we are offline.
|
||||
- On tenant detach.
|
||||
3. For each entry, keep track of whether it has passed (2).
|
||||
Only admit entries to (4) once they have passed (2).
|
||||
This requires re-writing / two queue entries (intent, commit) per deletion.
|
||||
|
||||
The important take-away with any of the above is that it's not
|
||||
disastrous to leak objects in exceptional circumstances.
|
||||
|
||||
#### Operations that may skip the queue
|
||||
|
||||
Deletions of an entire timeline are [exempt](#Timeline-Deletion) from generation number validation. Once the
|
||||
control plane sends the deletion request, there is no requirement to retain the readability
|
||||
of any data within the timeline, and all objects within the timeline path may be deleted
|
||||
at any time from the control plane's deletion request onwards.
|
||||
|
||||
Since deletions of smaller timelines won't have enough objects to compose a full sized
|
||||
DeleteObjects request, it is still useful to send these through the last part of the
|
||||
deletion pipeline to coalesce with other executing deletions: to enable this, the
|
||||
deletion queue should expose two input channels: one for deletions that must be
|
||||
processed in a generation-aware way, and a fast path for timeline deletions, where
|
||||
that fast path may skip validation and the persistent queue.
|
||||
|
||||
### Cleaning up orphan objects (scrubbing)
|
||||
|
||||
An orphan object is any object which is no longer referenced by a running node or by metadata.
|
||||
|
||||
Examples of how orphan objects arise:
|
||||
|
||||
- A node PUTs a layer object, then crashes before it writes the
|
||||
index_part.json that references that layer.
|
||||
- A stale node carries on running for some time, and writes out an unbounded number of
|
||||
objects while it believes itself to be the rightful writer for a tenant.
|
||||
- A pageserver crashes between un-linking an object from the index, and persisting
|
||||
the object to its deletion queue.
|
||||
|
||||
Orphan objects are functionally harmless, but have a small cost due to the S3 capacity they consume. We
may clean them up at some time in the future, by doing a ListObjectsv2 operation and cross-referencing
with the latest metadata to identify objects which are not referenced.
|
||||
|
||||
Scrubbing will be done only by an attached pageserver (not some third party process), and deletions requested during scrub will go through the same
|
||||
validation as all other deletions: the attachment generation must be
|
||||
fresh. This avoids the possibility of a stale pageserver incorrectly
|
||||
thinking that an object written by a newer generation is stale, and deleting
|
||||
it.
|
||||
|
||||
It is not strictly necessary that scrubbing be done by an attached
|
||||
pageserver: it could also be done externally. However, an external
|
||||
scrubber would still require the same validation procedure that
|
||||
a pageserver's deletion queue performs, before actually erasing
|
||||
objects.
|
||||
|
||||
## Operational impact
|
||||
|
||||
### Availability
|
||||
|
||||
Coordination of generation numbers via the control plane introduces a dependency for certain
|
||||
operations:
|
||||
|
||||
1. Starting new pageservers (or activating pageservers after a restart)
|
||||
2. Executing enqueued deletions
|
||||
3. Advertising updated `remote_consistent_lsn` to enable WAL trimming
|
||||
|
||||
Item 1. would mean that some in-place restarts that previously would have resumed service even if the control plane were
|
||||
unavailable, will now not resume service to users until the control plane is available. We could
|
||||
avoid this by having a timeout on communication with the control plane, and after some timeout,
|
||||
resume service with the previous generation numbers (assuming this was persisted to disk). However,
|
||||
this is unlikely to be needed as the control plane is already an essential & highly available component. Also, having a node re-use an old generation number would complicate
|
||||
reasoning about the system, as it would break the invariant that a generation number uniquely identifies
|
||||
a tenant's attachment to a given pageserver _process_: it would merely identify the tenant's attachment
|
||||
to the pageserver _machine_ or its _on-disk-state_.
|
||||
|
||||
Item 2. is a non-issue operationally: it's harmless to delay deletions, the only impact of objects pending deletion is
|
||||
the S3 capacity cost.
|
||||
|
||||
Item 3. could be an issue if safekeepers are low on disk space and the control plane is unavailable for a long time. If this became an issue,
|
||||
we could adjust the safekeeper to delete segments from local disk sooner, as soon as they're uploaded to S3, rather than waiting for
|
||||
remote_consistent_lsn to advance.
|
||||
|
||||
For a managed service, the general approach should be to make sure we are monitoring & respond fast enough
|
||||
that control plane outages are bounded in time.
|
||||
|
||||
There is also the fact that control plane runs in a single region.
|
||||
The latency for distant regions is not a big concern for us because all request types added by this RFC are either infrequent or not in the way of the data path.
|
||||
However, we lose region isolation for the operations listed above.
|
||||
The ongoing work to split the console and control plane will give us a per-region control plane, and all operations in this RFC can be handled by these per-region control planes.
|
||||
With that in mind, we accept the trade-offs outlined in this paragraph.
|
||||
|
||||
We will also implement an "escape hatch" config for generation numbers, where in a major disaster outage,
|
||||
we may manually run pageservers with a hand-selected generation number, so that we can bring them online
|
||||
independently of a control plane.
|
||||
|
||||
### Rollout
|
||||
|
||||
Although there is coupling between components, we may deploy most of the new data plane components
|
||||
independently of the control plane: initially they can just use a static generation number.
|
||||
|
||||
#### Phase 1
|
||||
|
||||
The pageserver is deployed with some special config to:
|
||||
|
||||
- Always act like everything is generation 1 and do not wait for a control plane issued generation on attach
|
||||
- Skip the places in deletion and remote_consistent_lsn updates where we would call into control plane
|
||||
|
||||
#### Phase 2
|
||||
|
||||
The control plane changes are deployed: control plane will now track and increment generation numbers.
|
||||
|
||||
#### Phase 3
|
||||
|
||||
The pageserver is deployed with its control-plane-dependent changes enabled: it will now require
|
||||
the control plane to service re-attach requests on startup, and handle generation
|
||||
validation requests.
|
||||
|
||||
### On-disk backward compatibility
|
||||
|
||||
Backward compatibility with existing data is straightforward:
|
||||
|
||||
- When reading the index, we may assume that any layer whose metadata doesn't include
|
||||
generations will have a path without generation suffix.
|
||||
- When locating the index file on attachment, we may use the "fallback" listing path
|
||||
and if there is only an index without generation suffix, that is the one we load.
|
||||
|
||||
It is not necessary to re-write existing layers: even new index files will be able
|
||||
to represent generation-less layers.
|
||||
|
||||
### On-disk forward compatibility
|
||||
|
||||
We will do a two phase rollout, probably over multiple releases because we will naturally
|
||||
have some of the read-side code ready before the overall functionality is ready:
|
||||
|
||||
1. Deploy pageservers which understand the new index format and generation suffixes
|
||||
in keys, but do not write objects with generation numbers in the keys.
|
||||
2. Deploy pageservers that write objects with generation numbers in the keys.
|
||||
|
||||
Old pageservers will be oblivious to generation numbers. That means that they can't
read objects with generation numbers in the name. This is why the
first step must deploy the ability to read such objects, before the second step
starts writing them.
|
||||
|
||||
# Frequently Asked Questions
|
||||
|
||||
## Why a generation _suffix_ rather than _prefix_?
|
||||
|
||||
The choice is motivated by object listing, since one can list by prefix but not
|
||||
suffix.
|
||||
|
||||
In [finding remote indices](#finding-the-remote-indices-for-timelines), we rely
|
||||
on being able to do a prefix listing for `<tenant>/<timeline>/index_part.json*`.
|
||||
That only works because the generation is appended as a suffix, leaving the listable prefix intact.
|
||||
|
||||
The converse case of using a generation prefix and listing by generation is
|
||||
not needed: one could imagine listing by generation while scrubbing (so that
|
||||
a particular generation's layers could be scrubbed), but this is not part
|
||||
of normal operations, and the [scrubber](#cleaning-up-orphan-objects-scrubbing) probably won't work that way anyway.
|
||||
|
||||
## Wouldn't it be simpler to have a separate deletion queue per timeline?
|
||||
|
||||
Functionally speaking, we could. That's how RemoteTimelineClient currently works,
|
||||
but this approach does not map well to a long-lived persistent queue with
|
||||
generation validation.
|
||||
|
||||
Anything we do per-timeline generates tiny random I/O, on a pageserver with
|
||||
tens of thousands of timelines operating: to be ready for high scale, we should:
|
||||
|
||||
- A) Amortize costs where we can (e.g. a shared deletion queue)
|
||||
- B) Expect to put tenants into a quiescent state while they're not
|
||||
busy: i.e. we shouldn't keep a tenant alive to service its deletion queue.
|
||||
|
||||
This was discussed in the [scope](#scope) part of the deletion queue section.
|
||||
|
||||
# Appendix A: Examples of use in high availability/failover
|
||||
|
||||
The generation numbers proposed in this RFC are adaptable to a variety of different
|
||||
failover scenarios and models. The sections below sketch how they would work in practice.
|
||||
|
||||
### In-place restart of a pageserver
|
||||
|
||||
"In-place" here means that the restart is done before any other element in the system
|
||||
has taken action in response to the node being down.
|
||||
|
||||
- After restart, the node issues a re-attach request to the control plane, and
|
||||
receives new generation numbers for all its attached tenants.
|
||||
- Tenants may be activated with the generation number in the re-attach response.
|
||||
- If any of its attachments were in fact stale (i.e. had been reassigned to another
  node while this node was offline), then
  - the re-attach response will inform the pageserver of this by _not_ incrementing the generation for that attachment.
|
||||
- This will implicitly block deletions in the tenant, but as an optimization
|
||||
the pageserver should also proactively stop doing S3 uploads when it notices this stale-generation state.
|
||||
- The control plane is expected to eventually detach this tenant from the
|
||||
pageserver.
|
||||
|
||||
If the control plane does not include a tenant in the re-attach response,
|
||||
but there is still local state for the tenant in the filesystem, the pageserver
|
||||
deletes the local state in response and does not load/activate the tenant.
|
||||
See the [earlier section on pageserver startup](#pageserver-attachstartup-changes) for details.
|
||||
Control plane can use this mechanism to clean up a pageserver that has been
|
||||
down for so long that all its tenants were migrated away before it came back
|
||||
up again and asked for re-attach.
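
The following is a minimal sketch of the startup decision described above,
using the `ReAttachResponse`/`HexTenantId` shapes that appear in the
`control_api` types later on this page. The module path and the surrounding
function are illustrative assumptions, not the pageserver's actual code.

```rust
use std::collections::HashMap;

use pageserver_api::control_api::ReAttachResponse; // shape shown later on this page
use utils::id::TenantId;

/// Split locally-present tenants into "activate with this generation" and
/// "stale local state to delete", based on the control plane's re-attach response.
fn plan_startup(
    response: ReAttachResponse,
    local_tenants: Vec<TenantId>,
) -> (Vec<(TenantId, u32)>, Vec<TenantId>) {
    let attached: HashMap<TenantId, u32> = response
        .tenants
        .into_iter()
        .map(|t| (t.id.take(), t.generation))
        .collect();

    let mut activate = Vec::new();
    let mut delete_local = Vec::new();
    for tenant_id in local_tenants {
        match attached.get(&tenant_id) {
            // Still attached to this node: activate with the new generation.
            Some(generation) => activate.push((tenant_id, *generation)),
            // Not in the response: the attachment moved elsewhere while this
            // node was down, so the local state is stale.
            None => delete_local.push(tenant_id),
        }
    }
    (activate, delete_local)
}
```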

### Failure of a pageserver

In this context, read "failure" as the most ambiguous possible case, where
a pageserver is unavailable to clients and the control plane, but may still be executing and talking
to S3.

#### Case A: re-attachment to other nodes

1. Let's say node 0 becomes unresponsive in a cluster of three nodes 0, 1, 2.
2. Some external mechanism notices that the node is unavailable and initiates
   movement of all tenants attached to that node to a different node according
   to some distribution rule.
   In this example, it would mean incrementing the generation
   of all tenants that were attached to node 0, as each tenant's assigned pageserver changes.
3. A tenant which is now attached to node 1 will _also_ still be attached to node
   0, from the perspective of node 0. Node 0 will still be using its old generation,
   node 1 will be using a newer generation.
4. S3 writes will continue from nodes 0 and 1: there will be an index_part.json-00000001
   _and_ an index_part.json-00000002. Objects written under the old suffix
   after the new attachment was created do not matter from the rest of the system's
   perspective: the endpoints are reading from the new attachment location. Objects
   written by node 0 are just garbage that can be cleaned up at leisure. Node 0 will
   not do any deletions because it can't synchronize with the control plane, or if it could,
   its deletion queue processing would get errors for the validation requests.

#### Case B: direct node replacement with same node_id and drive

This is the scenario we would experience if running pageservers in some dynamic
VM/container environment that would auto-replace a given node_id when it became
unresponsive, with the node's storage supplied by some network block device
that is attached to the replacement VM/container.

1. Let's say node 0 fails, and there may be some other peers but they aren't relevant.
2. Some external mechanism notices that the node is unavailable, and creates
   a "new node 0" (Node 0b) which is a physically separate server. The original node 0
   (Node 0a) may still be running, because we do not assume the environment fences nodes.
3. On startup, node 0b re-attaches and gets higher generation numbers for
   all tenants.
4. S3 writes continue from nodes 0a and 0b, but the writes do not collide due to the different
   generations in the suffix, and the writes from node 0a are not visible to the rest
   of the system because endpoints are reading only from node 0b.

# Appendix B: interoperability with other features

## Sharded Keyspace

The design in this RFC maps neatly to a sharded keyspace design where subsets of the key space
for a tenant are assigned to different pageservers:

- the "unit of work" for attachments becomes something like a TenantShard rather than a Tenant
- TenantShards get generation numbers just as Tenants do.
- Write workload (ingest, compaction) for a tenant is spread out across pageservers via
  TenantShards, but each TenantShard still has exactly one valid writer at a time, as sketched below.
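
A hypothetical sketch of that unit of work follows; none of these names exist
in the codebase today, they only illustrate where the generation would live
under sharding.

```rust
use utils::id::TenantId;

/// Hypothetical unit of attachment under a sharded keyspace: the generation
/// moves from the Tenant to the TenantShard, so each shard has exactly one
/// valid writer at a time.
struct TenantShard {
    tenant_id: TenantId,
    /// Which slice of the tenant's key space this shard owns (illustrative).
    shard_number: u8,
    /// Incremented by the control plane on each (re-)attachment of this shard.
    generation: u32,
}
```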

## Read replicas

_This section is about a passive reader of S3 pageserver state, not a postgres
read replica._

For historical reads to LSNs below the remote persistent LSN, any node may act as a reader at any
time: remote data is logically immutable, and the use of deferred deletion in this RFC helps
mitigate the fact that remote data is not _physically_ immutable (i.e. the actual data for a given
page moves around as compaction happens).

A read replica needs to be aware of generations in remote data in order to read the latest
metadata (find the index_part.json with the latest suffix). It may either query this
from the control plane, or find it with a ListObjectsV2 request.

## Seamless migration

To make tenant migration totally seamless, we will probably want to intentionally double-attach
a tenant briefly, serving reads from the old node while waiting for the new node to be ready.

This RFC enables that double-attachment: two nodes may be attached at the same time, with the migration destination
having a higher generation number. The old node will be able to ingest and serve reads, but not
do any deletes. The new node's attachment must also avoid deleting layers that the old node may
still use. A new piece of state
will be needed for this in the control plane's definition of an attachment.

## Warm secondary locations

To enable faster tenant movement after a pageserver is lost, we will probably want to spend some
disk capacity on keeping standby locations populated with local disk data.

There's no conflict between this RFC and that: implementing warm secondary locations on a per-tenant basis
would be a separate change to the control plane to store standby location(s) for a tenant. Because
the standbys do not write to S3, they do not need to be assigned generation numbers. When a tenant is
re-attached to a standby location, that would increment the tenant attachment generation and this
would work the same as any other attachment change, but with a warm cache.

## Ephemeral node IDs

This RFC intentionally avoids changing anything fundamental about how pageservers are identified
and registered with the control plane, to avoid coupling the implementation of pageserver split-brain
protection with more fundamental changes in the management of the pageservers.

Moving to ephemeral node IDs would provide an extra layer of
resilience in the system, as it would prevent the control plane from
accidentally attaching to two physical nodes with the same
generation, if somehow there were two physical nodes with
the same node IDs (currently we rely on EC2 guarantees to
eliminate this scenario). With ephemeral node IDs, there would be
no possibility of that happening, no matter the behavior of the
underlying infrastructure.

Nothing fundamental in the pageserver's handling of generations needs to change to handle ephemeral node IDs, since we hardly use the
`node_id` anywhere. The `/re-attach` API would be extended
to enable the pageserver to obtain its ephemeral ID, and provide
some correlation identifier (e.g. an EC2 instance ID), to help the
control plane re-attach tenants to the same physical server that
previously had them attached.
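
For illustration only, here is a hypothetical shape for a re-attach request in
that world; the current `ReAttachRequest` shown later on this page carries a
fixed `node_id` instead, and none of these field names are real.

```rust
use serde::{Deserialize, Serialize};

/// Hypothetical re-attach request for ephemeral node IDs: the pageserver
/// presents a correlation identifier (e.g. an EC2 instance ID) rather than a
/// fixed node_id, and the control plane replies with an ephemeral ID plus the
/// tenants to attach.
#[derive(Serialize, Deserialize)]
pub struct EphemeralReAttachRequest {
    /// Identifies the physical machine, so the control plane can prefer to
    /// re-attach the tenants it previously held.
    pub instance_correlation_id: String,
}
```
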
@@ -12,7 +12,6 @@ const_format.workspace = true
|
||||
anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
byteorder.workspace = true
|
||||
hex.workspace = true
|
||||
utils.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
enum-map.workspace = true
|
||||
|
||||
@@ -1,89 +0,0 @@
|
||||
/// Types in this file are for pageserver's upward-facing API calls to the control plane
|
||||
use hex::FromHex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
/// TenantId's serialization is an array of u8, which is rather unfriendly
|
||||
/// for outside callers who aren't working with the native Rust TenantId.
|
||||
/// This class wraps it in serialization that is just the hex string
|
||||
/// representation.
|
||||
#[derive(Eq, PartialEq, Clone, Hash)]
|
||||
pub struct HexTenantId(TenantId);
|
||||
|
||||
impl HexTenantId {
|
||||
pub fn new(t: TenantId) -> Self {
|
||||
Self(t)
|
||||
}
|
||||
|
||||
pub fn take(self) -> TenantId {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<TenantId> for HexTenantId {
|
||||
fn as_ref(&self) -> &TenantId {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for HexTenantId {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
let hex = self.0.hex_encode();
|
||||
serializer.collect_str(&hex)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for HexTenantId {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let string = String::deserialize(deserializer)?;
|
||||
TenantId::from_hex(string)
|
||||
.map(|t| HexTenantId::new(t))
|
||||
.map_err(|e| serde::de::Error::custom(format!("{e}")))
|
||||
}
|
||||
}
|
||||
|
||||
// Top level s
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachRequest {
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponseTenant {
|
||||
pub id: HexTenantId,
|
||||
pub generation: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponse {
|
||||
pub tenants: Vec<ReAttachResponseTenant>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateRequestTenant {
|
||||
pub id: HexTenantId,
|
||||
pub gen: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateRequest {
|
||||
pub tenants: Vec<ValidateRequestTenant>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateResponse {
|
||||
pub tenants: Vec<ValidateResponseTenant>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateResponseTenant {
|
||||
pub id: HexTenantId,
|
||||
pub valid: bool,
|
||||
}
|
||||
@@ -1,7 +1,6 @@
|
||||
use const_format::formatcp;
|
||||
|
||||
/// Public API types
|
||||
pub mod control_api;
|
||||
pub mod models;
|
||||
pub mod reltag;
|
||||
|
||||
|
||||
@@ -194,9 +194,6 @@ pub struct TimelineCreateRequest {
|
||||
pub struct TenantCreateRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub new_tenant_id: TenantId,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
#[serde(flatten)]
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
@@ -244,6 +241,15 @@ pub struct StatusResponse {
|
||||
pub id: NodeId,
|
||||
}
|
||||
|
||||
impl TenantCreateRequest {
|
||||
pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest {
|
||||
TenantCreateRequest {
|
||||
new_tenant_id,
|
||||
config: TenantConfig::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
@@ -287,11 +293,9 @@ impl TenantConfigRequest {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct TenantAttachRequest {
|
||||
pub config: TenantAttachConfig,
|
||||
#[serde(default)]
|
||||
pub generation: Option<u32>,
|
||||
}
|
||||
|
||||
/// Newtype to enforce deny_unknown_fields on TenantConfig for
|
||||
|
||||
@@ -13,14 +13,13 @@ use std::{
|
||||
collections::HashMap,
|
||||
fmt::Debug,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
path::{Path, PathBuf, StripPrefixError},
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io;
|
||||
use toml_edit::Item;
|
||||
use tracing::info;
|
||||
@@ -45,34 +44,12 @@ pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||
|
||||
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
|
||||
// From the S3 spec
|
||||
pub const MAX_KEYS_PER_DELETE: usize = 1000;
|
||||
|
||||
/// Path on the remote storage, relative to some inner prefix.
|
||||
/// The prefix is an implementation detail, that allows representing local paths
|
||||
/// as the remote ones, stripping the local storage prefix away.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct RemotePath(PathBuf);
|
||||
|
||||
impl Serialize for RemotePath {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for RemotePath {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let str = String::deserialize(deserializer)?;
|
||||
Ok(Self(PathBuf::from(&str)))
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for RemotePath {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0.display())
|
||||
@@ -111,15 +88,6 @@ impl RemotePath {
|
||||
pub fn extension(&self) -> Option<&str> {
|
||||
self.0.extension()?.to_str()
|
||||
}
|
||||
|
||||
/// Unwrap the PathBuf that RemotePath wraps
|
||||
pub fn take(self) -> PathBuf {
|
||||
self.0
|
||||
}
|
||||
|
||||
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, StripPrefixError> {
|
||||
self.0.strip_prefix(&p.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
@@ -198,8 +166,6 @@ pub enum DownloadError {
|
||||
BadInput(anyhow::Error),
|
||||
/// The file was not found in the remote storage.
|
||||
NotFound,
|
||||
/// The client was shut down
|
||||
Shutdown,
|
||||
/// The file was found in the remote storage, but the download failed.
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
@@ -211,7 +177,6 @@ impl std::fmt::Display for DownloadError {
|
||||
write!(f, "Failed to download a remote file due to user input: {e}")
|
||||
}
|
||||
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
|
||||
DownloadError::Shutdown => write!(f, "Client shutting down"),
|
||||
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
|
||||
}
|
||||
}
|
||||
@@ -276,18 +241,6 @@ impl GenericRemoteStorage {
|
||||
}
|
||||
}
|
||||
|
||||
/// For small, simple downloads where caller doesn't want to handle the streaming: return the full body
|
||||
pub async fn download_all(&self, from: &RemotePath) -> Result<Vec<u8>, DownloadError> {
|
||||
let mut download = self.download(from).await?;
|
||||
|
||||
let mut bytes = Vec::new();
|
||||
tokio::io::copy(&mut download.download_stream, &mut bytes)
|
||||
.await
|
||||
.with_context(|| format!("Failed to download body from {from}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
Ok(bytes)
|
||||
}
|
||||
|
||||
pub async fn download_byte_range(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
|
||||
@@ -148,53 +148,21 @@ impl RemoteStorage for LocalFs {
|
||||
Some(folder) => folder.with_base(&self.storage_root),
|
||||
None => self.storage_root.clone(),
|
||||
};
|
||||
|
||||
// If we were given a directory, we may use it as our starting point.
|
||||
// Otherwise, we must go up to the parent directory. This is because
|
||||
// S3 object list prefixes can be arbitrary strings, but when reading
|
||||
// the local filesystem we need a directory to start calling read_dir on.
|
||||
let mut initial_dir = full_path.clone();
|
||||
match fs::metadata(full_path.clone()).await {
|
||||
Err(e) => {
|
||||
// It's not a file that exists: strip the prefix back to the parent directory
|
||||
if matches!(e.kind(), ErrorKind::NotFound) {
|
||||
initial_dir.pop();
|
||||
}
|
||||
}
|
||||
Ok(meta) => {
|
||||
if !meta.is_dir() {
|
||||
// It's not a directory: strip back to the parent
|
||||
initial_dir.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Note that PathBuf starts_with only considers full path segments, but
|
||||
// object prefixes are arbitrary strings, so we need the strings for doing
|
||||
// starts_with later.
|
||||
let prefix = full_path.to_string_lossy();
|
||||
|
||||
let mut files = vec![];
|
||||
let mut directory_queue = vec![initial_dir.clone()];
|
||||
let mut directory_queue = vec![full_path.clone()];
|
||||
|
||||
while let Some(cur_folder) = directory_queue.pop() {
|
||||
let mut entries = fs::read_dir(cur_folder.clone()).await?;
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let file_name: PathBuf = entry.file_name().into();
|
||||
let full_file_name = cur_folder.clone().join(&file_name);
|
||||
if full_file_name
|
||||
.to_str()
|
||||
.map(|s| s.starts_with(prefix.as_ref()))
|
||||
.unwrap_or(false)
|
||||
{
|
||||
let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
|
||||
files.push(file_remote_path.clone());
|
||||
if full_file_name.is_dir() {
|
||||
directory_queue.push(full_file_name);
|
||||
}
|
||||
let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
|
||||
files.push(file_remote_path.clone());
|
||||
if full_file_name.is_dir() {
|
||||
directory_queue.push(full_file_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ use aws_sdk_s3::{
|
||||
Client,
|
||||
};
|
||||
use aws_smithy_http::body::SdkBody;
|
||||
use hyper::{Body, StatusCode};
|
||||
use hyper::Body;
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio::{
|
||||
io::{self, AsyncRead},
|
||||
@@ -529,16 +529,7 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
if let Some(r) = e.raw_response() {
|
||||
if r.http().status() == StatusCode::NOT_FOUND {
|
||||
// 404 is acceptable for deletions. AWS S3 does not return this, but
|
||||
// some other implementations might (e.g. GCS XML API returns 404 on DeleteObject
|
||||
// to a missing key)
|
||||
continue;
|
||||
} else {
|
||||
return Err(anyhow::format_err!("DeleteObjects response error: {e}"));
|
||||
}
|
||||
}
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,8 +31,6 @@ fn lsn_invalid() -> Lsn {
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct SkTimelineInfo {
|
||||
/// Term.
|
||||
pub term: Option<u64>,
|
||||
/// Term of the last entry.
|
||||
pub last_log_term: Option<u64>,
|
||||
/// LSN of the last record.
|
||||
@@ -60,6 +58,4 @@ pub struct SkTimelineInfo {
|
||||
/// A connection string to use for WAL receiving.
|
||||
#[serde(default)]
|
||||
pub safekeeper_connstr: Option<String>,
|
||||
#[serde(default)]
|
||||
pub http_connstr: Option<String>,
|
||||
}
|
||||
|
||||
@@ -26,7 +26,6 @@ serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-error.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
|
||||
@@ -38,7 +37,6 @@ url.workspace = true
|
||||
uuid.workspace = true
|
||||
|
||||
pq_proto.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
metrics.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
|
||||
@@ -1,31 +1,18 @@
|
||||
use std::fmt::{Debug, Display};
|
||||
|
||||
use futures::Future;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
|
||||
pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
|
||||
|
||||
pub async fn exponential_backoff(
|
||||
n: u32,
|
||||
base_increment: f64,
|
||||
max_seconds: f64,
|
||||
cancel: &CancellationToken,
|
||||
) {
|
||||
pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
|
||||
let backoff_duration_seconds =
|
||||
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
|
||||
if backoff_duration_seconds > 0.0 {
|
||||
tracing::info!(
|
||||
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
|
||||
);
|
||||
|
||||
drop(
|
||||
tokio::time::timeout(
|
||||
std::time::Duration::from_secs_f64(backoff_duration_seconds),
|
||||
cancel.cancelled(),
|
||||
)
|
||||
.await,
|
||||
)
|
||||
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -37,57 +24,28 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
|
||||
}
|
||||
}
|
||||
|
||||
/// Configure cancellation for a retried operation: when to cancel (the token), and
|
||||
/// what kind of error to return on cancellation
|
||||
pub struct Cancel<E, CF>
|
||||
where
|
||||
E: Display + Debug + 'static,
|
||||
CF: Fn() -> E,
|
||||
{
|
||||
token: CancellationToken,
|
||||
on_cancel: CF,
|
||||
}
|
||||
|
||||
impl<E, CF> Cancel<E, CF>
|
||||
where
|
||||
E: Display + Debug + 'static,
|
||||
CF: Fn() -> E,
|
||||
{
|
||||
pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
|
||||
Self { token, on_cancel }
|
||||
}
|
||||
}
|
||||
|
||||
/// Retries the passed operation until one of the following conditions is met:
|
||||
/// Encountered error is considered as permanent (non-retryable)
|
||||
/// Retries have been exhausted.
|
||||
/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
|
||||
/// When attempts cross `warn_threshold` function starts to emit log warnings.
|
||||
/// `description` argument is added to log messages. Its value should identify what the `op` is doing
|
||||
/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken
|
||||
/// to drop out promptly on shutdown.
|
||||
pub async fn retry<T, O, F, E, CF>(
|
||||
pub async fn retry<T, O, F, E>(
|
||||
mut op: O,
|
||||
is_permanent: impl Fn(&E) -> bool,
|
||||
warn_threshold: u32,
|
||||
max_retries: u32,
|
||||
description: &str,
|
||||
cancel: Cancel<E, CF>,
|
||||
) -> Result<T, E>
|
||||
where
|
||||
// Not std::error::Error because anyhow::Error doesnt implement it.
|
||||
// For context see https://github.com/dtolnay/anyhow/issues/63
|
||||
E: Display + Debug + 'static,
|
||||
E: Display + Debug,
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, E>>,
|
||||
CF: Fn() -> E,
|
||||
{
|
||||
let mut attempts = 0;
|
||||
loop {
|
||||
if cancel.token.is_cancelled() {
|
||||
return Err((cancel.on_cancel)());
|
||||
}
|
||||
|
||||
let result = op().await;
|
||||
match result {
|
||||
Ok(_) => {
|
||||
@@ -122,7 +80,6 @@ where
|
||||
attempts,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
&cancel.token,
|
||||
)
|
||||
.await;
|
||||
attempts += 1;
|
||||
@@ -175,7 +132,6 @@ mod tests {
|
||||
1,
|
||||
1,
|
||||
"work",
|
||||
Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -201,7 +157,6 @@ mod tests {
|
||||
2,
|
||||
2,
|
||||
"work",
|
||||
Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -224,7 +179,6 @@ mod tests {
|
||||
2,
|
||||
2,
|
||||
"work",
|
||||
Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
|
||||
@@ -1,121 +0,0 @@
|
||||
use std::fmt::Display;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
|
||||
pub enum Generation {
|
||||
// Generations with this magic value will not add a suffix to S3 keys, and will not
|
||||
// be included in persisted index_part.json. This value is only to be used
|
||||
// during migration from pre-generation metadata to generation-aware metadata,
|
||||
// and should eventually go away.
|
||||
//
|
||||
// A special Generation is used rather than always wrapping Generation in an Option,
|
||||
// so that code handling generations doesn't have to be aware of the legacy
|
||||
// case everywhere it touches a generation.
|
||||
None,
|
||||
// Generations with this magic value may never be used to construct S3 keys:
|
||||
// we will panic if someone tries to. This is for Tenants in the "Broken" state,
|
||||
// so that we can satisfy their constructor with a Generation without risking
|
||||
// a code bug using it in an S3 write (broken tenants should never write)
|
||||
Broken,
|
||||
Valid(u32),
|
||||
}
|
||||
|
||||
/// The Generation type represents a number associated with a Tenant, which
|
||||
/// increments every time the tenant is attached to a new pageserver, or
|
||||
/// an attached pageserver restarts.
|
||||
///
|
||||
/// It is included as a suffix in S3 keys, as a protection against split-brain
|
||||
/// scenarios where pageservers might otherwise issue conflicting writes to
|
||||
/// remote storage
|
||||
impl Generation {
|
||||
/// Create a new Generation that represents a legacy key format with
|
||||
/// no generation suffix
|
||||
pub fn none() -> Self {
|
||||
Self::None
|
||||
}
|
||||
|
||||
// Create a new generation that will panic if you try to use get_suffix
|
||||
pub fn broken() -> Self {
|
||||
Self::Broken
|
||||
}
|
||||
|
||||
pub fn new(v: u32) -> Self {
|
||||
Self::Valid(v)
|
||||
}
|
||||
|
||||
pub fn is_none(&self) -> bool {
|
||||
matches!(self, Self::None)
|
||||
}
|
||||
|
||||
pub fn get_suffix(&self) -> String {
|
||||
match self {
|
||||
Self::Valid(v) => {
|
||||
format!("-{:08x}", v)
|
||||
}
|
||||
Self::None => "".into(),
|
||||
Self::Broken => {
|
||||
panic!("Tried to use a broken generation");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn previous(&self) -> Self {
|
||||
if let Self::Valid(v) = self {
|
||||
Self::new(v - 1)
|
||||
} else {
|
||||
Self::none()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into(self) -> Option<u32> {
|
||||
if let Self::Valid(v) = self {
|
||||
Some(v)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for Generation {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
if let Self::Valid(v) = self {
|
||||
v.serialize(serializer)
|
||||
} else {
|
||||
// We should never be asked to serialize a None or Broken. Structures
|
||||
// that include an optional generation should convert None to an
|
||||
// Option<Generation>::None
|
||||
Err(serde::ser::Error::custom(
|
||||
"Tried to serialize invalid generation",
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for Generation {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
Ok(Self::Valid(u32::deserialize(deserializer)?))
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Generation {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::Valid(v) => {
|
||||
write!(f, "{:08x}", v)
|
||||
}
|
||||
Self::None => {
|
||||
write!(f, "<none>")
|
||||
}
|
||||
Self::Broken => {
|
||||
write!(f, "<broken>")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -24,9 +24,6 @@ pub enum ApiError {
|
||||
#[error("Precondition failed: {0}")]
|
||||
PreconditionFailed(Box<str>),
|
||||
|
||||
#[error("Shutting down")]
|
||||
ShuttingDown,
|
||||
|
||||
#[error(transparent)]
|
||||
InternalServerError(anyhow::Error),
|
||||
}
|
||||
@@ -55,10 +52,6 @@ impl ApiError {
|
||||
self.to_string(),
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
),
|
||||
ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
|
||||
"Shutting down".to_string(),
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
),
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
|
||||
@@ -50,7 +50,7 @@ impl Id {
|
||||
Id::from(tli_buf)
|
||||
}
|
||||
|
||||
pub fn hex_encode(&self) -> String {
|
||||
fn hex_encode(&self) -> String {
|
||||
static HEX: &[u8] = b"0123456789abcdef";
|
||||
|
||||
let mut buf = vec![0u8; self.0.len() * 2];
|
||||
@@ -133,10 +133,6 @@ macro_rules! id_newtype {
|
||||
pub const fn from_array(b: [u8; 16]) -> Self {
|
||||
$t(Id(b))
|
||||
}
|
||||
|
||||
pub fn hex_encode(&self) -> String {
|
||||
self.0.hex_encode()
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for $t {
|
||||
@@ -248,13 +244,13 @@ id_newtype!(TenantId);
|
||||
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
|
||||
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
|
||||
/// See [`Id`] for alternative ways to serialize it.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
|
||||
pub struct ConnectionId(Id);
|
||||
|
||||
id_newtype!(ConnectionId);
|
||||
|
||||
// A pair uniquely identifying Neon instance.
|
||||
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct TenantTimelineId {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
@@ -277,36 +273,6 @@ impl TenantTimelineId {
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for TenantTimelineId {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for TenantTimelineId {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let str = String::deserialize(deserializer)?;
|
||||
if let Some((tenant_part, timeline_part)) = str.split_once('/') {
|
||||
Ok(Self {
|
||||
tenant_id: TenantId(Id::from_hex(tenant_part).map_err(|e| {
|
||||
serde::de::Error::custom(format!("Malformed tenant in TenantTimelineId: {e}"))
|
||||
})?),
|
||||
timeline_id: TimelineId(Id::from_hex(timeline_part).map_err(|e| {
|
||||
serde::de::Error::custom(format!("Malformed timeline in TenantTimelineId {e}"))
|
||||
})?),
|
||||
})
|
||||
} else {
|
||||
Err(serde::de::Error::custom("Malformed TenantTimelineId"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for TenantTimelineId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "{}/{}", self.tenant_id, self.timeline_id)
|
||||
|
||||
@@ -27,9 +27,6 @@ pub mod id;
|
||||
// http endpoint utils
|
||||
pub mod http;
|
||||
|
||||
// definition of the Generation type for pageserver attachment APIs
|
||||
pub mod generation;
|
||||
|
||||
// common log initialisation routine
|
||||
pub mod logging;
|
||||
|
||||
@@ -61,8 +58,6 @@ pub mod serde_regex;
|
||||
|
||||
pub mod pageserver_feedback;
|
||||
|
||||
pub mod postgres_client;
|
||||
|
||||
pub mod tracing_span_assert;
|
||||
|
||||
pub mod rate_limit;
|
||||
|
||||
@@ -1,37 +0,0 @@
|
||||
//! Postgres client connection code common to other crates (safekeeper and
|
||||
//! pageserver) which depends on tenant/timeline ids and thus not fitting into
|
||||
//! postgres_connection crate.
|
||||
|
||||
use anyhow::Context;
|
||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
|
||||
use crate::id::TenantTimelineId;
|
||||
|
||||
/// Create client config for fetching WAL from safekeeper on particular timeline.
|
||||
/// listen_pg_addr_str is in form host:\[port\].
|
||||
pub fn wal_stream_connection_config(
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: TenantTimelineId,
|
||||
listen_pg_addr_str: &str,
|
||||
auth_token: Option<&str>,
|
||||
availability_zone: Option<&str>,
|
||||
) -> anyhow::Result<PgConnectionConfig> {
|
||||
let (host, port) =
|
||||
parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
|
||||
let port = port.unwrap_or(5432);
|
||||
let mut connstr = PgConnectionConfig::new_host_port(host, port)
|
||||
.extend_options([
|
||||
"-c".to_owned(),
|
||||
format!("timeline_id={}", timeline_id),
|
||||
format!("tenant_id={}", tenant_id),
|
||||
])
|
||||
.set_password(auth_token.map(|s| s.to_owned()));
|
||||
|
||||
if let Some(availability_zone) = availability_zone {
|
||||
connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
|
||||
}
|
||||
|
||||
Ok(connstr)
|
||||
}
|
||||
@@ -1,31 +0,0 @@
|
||||
[package]
|
||||
name = "vm_monitor"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[[bin]]
|
||||
name = "vm-monitor"
|
||||
path = "./src/bin/monitor.rs"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
axum.workspace = true
|
||||
clap.workspace = true
|
||||
futures.workspace = true
|
||||
inotify.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
sysinfo.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[target.'cfg(target_os = "linux")'.dependencies]
|
||||
cgroups-rs = "0.3.3"
|
||||
@@ -1,34 +0,0 @@
|
||||
# `vm-monitor`
|
||||
|
||||
The `vm-monitor` (or just monitor) is a core component of the autoscaling system,
|
||||
along with the `autoscale-scheduler` and the `autoscaler-agent`s. The monitor has
|
||||
two primary roles: 1) notifying agents when immediate upscaling is necessary due
|
||||
to memory conditions and 2) managing Postgres' file cache and a cgroup to carry
|
||||
out upscaling and downscaling decisions.
|
||||
|
||||
## More on scaling
|
||||
|
||||
We scale CPU and memory using NeonVM, our in-house QEMU tool for use with Kubernetes.
|
||||
To control thresholds for receiving memory usage notifications, we start Postgres
|
||||
in the `neon-postgres` cgroup and set its `memory.{max,high}`.
|
||||
|
||||
* See also: [`neondatabase/autoscaling`](https://github.com/neondatabase/autoscaling/)
|
||||
* See also: [`neondatabase/vm-monitor`](https://github.com/neondatabase/vm-monitor/),
|
||||
where initial development of the monitor happened. The repository is no longer
|
||||
maintained but the commit history may be useful for debugging.
|
||||
|
||||
## Structure
|
||||
|
||||
The `vm-monitor` is loosely comprised of a few systems. These are:
|
||||
* the server: this is just a simple `axum` server that accepts requests and
|
||||
upgrades them to websocket connections. The server only allows one connection at
|
||||
a time. This means that upon receiving a new connection, the server will terminate
|
||||
an old one if it exists.
|
||||
* the filecache: a struct that allows communication with the Postgres file cache.
|
||||
On startup, we connect to the filecache and hold on to the connection for the
|
||||
entire monitor lifetime.
|
||||
* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
|
||||
listening for `memory.high` events and setting its `memory.{high,max}` values.
|
||||
* the runner: the runner marries the filecache and cgroup watcher together,
|
||||
communicating with the agent through the `Dispatcher`, and then calling filecache
|
||||
and cgroup watcher functions as needed to upscale and downscale
|
||||
@@ -1,33 +0,0 @@
|
||||
// We expose a standalone binary _and_ start the monitor in `compute_ctl` so that
|
||||
// we can test the monitor as part of the entire autoscaling system in
|
||||
// neondatabase/autoscaling.
|
||||
//
|
||||
// The monitor was previously started by vm-builder, and for testing purposes,
|
||||
// we can mimic that setup with this binary.
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
use clap::Parser;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing_subscriber::EnvFilter;
|
||||
use vm_monitor::Args;
|
||||
|
||||
let subscriber = tracing_subscriber::fmt::Subscriber::builder()
|
||||
.json()
|
||||
.with_file(true)
|
||||
.with_line_number(true)
|
||||
.with_span_list(true)
|
||||
.with_env_filter(EnvFilter::from_default_env())
|
||||
.finish();
|
||||
tracing::subscriber::set_global_default(subscriber)?;
|
||||
|
||||
let args: &'static Args = Box::leak(Box::new(Args::parse()));
|
||||
let token = CancellationToken::new();
|
||||
vm_monitor::start(args, token).await
|
||||
}
|
||||
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
fn main() {
|
||||
panic!("the monitor requires cgroups, which are only available on linux")
|
||||
}
|
||||
@@ -1,693 +0,0 @@
|
||||
use std::{
|
||||
fmt::{Debug, Display},
|
||||
fs,
|
||||
pin::pin,
|
||||
sync::atomic::{AtomicU64, Ordering},
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use cgroups_rs::{
|
||||
freezer::FreezerController,
|
||||
hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
|
||||
memory::MemController,
|
||||
MaxValue,
|
||||
Subsystem::{Freezer, Mem},
|
||||
};
|
||||
use inotify::{EventStream, Inotify, WatchMask};
|
||||
use tokio::sync::mpsc::{self, error::TryRecvError};
|
||||
use tokio::time::{Duration, Instant};
|
||||
use tokio_stream::{Stream, StreamExt};
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::protocol::Resources;
|
||||
use crate::MiB;
|
||||
|
||||
/// Monotonically increasing counter of the number of memory.high events
|
||||
/// the cgroup has experienced.
|
||||
///
|
||||
/// We use this to determine if a modification to the `memory.events` file actually
|
||||
/// changed the `high` field. If not, we don't care about the change. When we
|
||||
/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
|
||||
/// to see if it changed since last time.
|
||||
pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
|
||||
|
||||
/// Monotonically increasing counter that gives each cgroup event a unique id.
|
||||
///
|
||||
/// This allows us to answer questions like "did this upscale arrive before this
|
||||
/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
|
||||
/// with a sequence number. As such, prefer to use the `Sequenced` type rather
|
||||
/// than this static directly.
|
||||
static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
|
||||
|
||||
/// A memory event type reported in memory.events.
|
||||
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
|
||||
pub enum MemoryEvent {
|
||||
Low,
|
||||
High,
|
||||
Max,
|
||||
Oom,
|
||||
OomKill,
|
||||
OomGroupKill,
|
||||
}
|
||||
|
||||
impl MemoryEvent {
|
||||
fn as_str(&self) -> &str {
|
||||
match self {
|
||||
MemoryEvent::Low => "low",
|
||||
MemoryEvent::High => "high",
|
||||
MemoryEvent::Max => "max",
|
||||
MemoryEvent::Oom => "oom",
|
||||
MemoryEvent::OomKill => "oom_kill",
|
||||
MemoryEvent::OomGroupKill => "oom_group_kill",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for MemoryEvent {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration for a `CgroupWatcher`
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Config {
|
||||
// The target difference between the total memory reserved for the cgroup
|
||||
// and the value of the cgroup's memory.high.
|
||||
//
|
||||
// In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
|
||||
// use (equal to system memory, minus whatever's taken out for the file cache).
|
||||
oom_buffer_bytes: u64,
|
||||
|
||||
// The amount of memory, in bytes, below a proposed new value for
|
||||
// memory.high that the cgroup's memory usage must be for us to downscale
|
||||
//
|
||||
// In other words, we can downscale only when:
|
||||
//
|
||||
// memory.current + memory_high_buffer_bytes < (proposed) memory.high
|
||||
//
|
||||
// TODO: there's some minor issues with this approach -- in particular, that we might have
|
||||
// memory in use by the kernel's page cache that we're actually ok with getting rid of.
|
||||
pub(crate) memory_high_buffer_bytes: u64,
|
||||
|
||||
// The maximum duration, in milliseconds, that we're allowed to pause
|
||||
// the cgroup for while waiting for the autoscaler-agent to upscale us
|
||||
max_upscale_wait: Duration,
|
||||
|
||||
// The required minimum time, in milliseconds, that we must wait before re-freezing
|
||||
// the cgroup while waiting for the autoscaler-agent to upscale us.
|
||||
do_not_freeze_more_often_than: Duration,
|
||||
|
||||
// The amount of memory, in bytes, that we should periodically increase memory.high
|
||||
// by while waiting for the autoscaler-agent to upscale us.
|
||||
//
|
||||
// This exists to avoid the excessive throttling that happens when a cgroup is above its
|
||||
// memory.high for too long. See more here:
|
||||
// https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217
|
||||
memory_high_increase_by_bytes: u64,
|
||||
|
||||
// The period, in milliseconds, at which we should repeatedly increase the value
|
||||
// of the cgroup's memory.high while we're waiting on upscaling and memory.high
|
||||
// is still being hit.
|
||||
//
|
||||
// Technically speaking, this actually serves as a rate limit to moderate responding to
|
||||
// memory.high events, but these are roughly equivalent if the process is still allocating
|
||||
// memory.
|
||||
memory_high_increase_every: Duration,
|
||||
}
|
||||
|
||||
impl Config {
|
||||
/// Calculate the new value for the cgroup's memory.high based on system memory
|
||||
pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
|
||||
total_system_mem.saturating_sub(self.oom_buffer_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Config {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
oom_buffer_bytes: 100 * MiB,
|
||||
memory_high_buffer_bytes: 100 * MiB,
|
||||
// while waiting for upscale, don't freeze for more than 20ms every 1s
|
||||
max_upscale_wait: Duration::from_millis(20),
|
||||
do_not_freeze_more_often_than: Duration::from_millis(1000),
|
||||
// while waiting for upscale, increase memory.high by 10MiB every 25ms
|
||||
memory_high_increase_by_bytes: 10 * MiB,
|
||||
memory_high_increase_every: Duration::from_millis(25),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Used to represent data that is associated with a certain point in time, such
|
||||
/// as an upscale request or memory.high event.
|
||||
///
|
||||
/// Internally, creating a `Sequenced` uses a static atomic counter to obtain
|
||||
/// a unique sequence number. Sequence numbers are monotonically increasing,
|
||||
/// allowing us to answer questions like "did this upscale happen after this
|
||||
/// memory.high event?" by comparing the sequence numbers of the two events.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Sequenced<T> {
|
||||
seqnum: u64,
|
||||
data: T,
|
||||
}
|
||||
|
||||
impl<T> Sequenced<T> {
|
||||
pub fn new(data: T) -> Self {
|
||||
Self {
|
||||
seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel),
|
||||
data,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Responds to `MonitorEvents` to manage the cgroup: preventing it from being
|
||||
/// OOM killed or throttling.
|
||||
///
|
||||
/// The `CgroupWatcher` primarily achieves this by reading from a stream of
|
||||
/// `MonitorEvent`s. See `main_signals_loop` for details on how to keep the
|
||||
/// cgroup happy.
|
||||
#[derive(Debug)]
|
||||
pub struct CgroupWatcher {
|
||||
pub config: Config,
|
||||
|
||||
/// The sequence number of the last upscale.
|
||||
///
|
||||
/// If we receive a memory.high event that has a _lower_ sequence number than
|
||||
/// `last_upscale_seqnum`, then we know it occurred before the upscale, and we
|
||||
/// can safely ignore it.
|
||||
///
|
||||
/// Note: Like the `events` field, this doesn't _need_ interior mutability but we
|
||||
/// use it anyways so that methods take `&self`, not `&mut self`.
|
||||
last_upscale_seqnum: AtomicU64,
|
||||
|
||||
/// A channel on which we send messages to request upscale from the dispatcher.
|
||||
upscale_requester: mpsc::Sender<()>,
|
||||
|
||||
/// The actual cgroup we are watching and managing.
|
||||
cgroup: cgroups_rs::Cgroup,
|
||||
}
|
||||
|
||||
/// Read memory.events for the desired event type.
|
||||
///
|
||||
/// `path` specifies the path to the desired `memory.events` file.
|
||||
/// For more info, see the `memory.events` section of the [kernel docs]
|
||||
/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
|
||||
fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
|
||||
let contents = fs::read_to_string(path)
|
||||
.with_context(|| format!("failed to read memory.events from {path}"))?;
|
||||
|
||||
// The contents of the file look like:
|
||||
// low 42
|
||||
// high 101
|
||||
// ...
|
||||
contents
|
||||
.lines()
|
||||
.filter_map(|s| s.split_once(' '))
|
||||
.find(|(e, _)| *e == event.as_str())
|
||||
.ok_or_else(|| anyhow!("failed to find entry for memory.{event} events in {path}"))
|
||||
.and_then(|(_, count)| {
|
||||
count
|
||||
.parse::<u64>()
|
||||
.with_context(|| format!("failed to parse memory.{event} as u64"))
|
||||
})
|
||||
}
|
||||
|
||||
/// Create an event stream that produces events whenever the file at the provided
|
||||
/// path is modified.
|
||||
fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
|
||||
info!("creating file watcher for {path}");
|
||||
let inotify = Inotify::init().context("failed to initialize file watcher")?;
|
||||
inotify
|
||||
.watches()
|
||||
.add(path, WatchMask::MODIFY)
|
||||
.with_context(|| format!("failed to start watching {path}"))?;
|
||||
inotify
|
||||
// The inotify docs use [0u8; 1024] so we'll just copy them. We only need
|
||||
// to store one event at a time - if the event gets written over, that's
|
||||
// ok. We still see that there is an event. For more information, see:
|
||||
// https://man7.org/linux/man-pages/man7/inotify.7.html
|
||||
.into_event_stream([0u8; 1024])
|
||||
.context("failed to start inotify event stream")
|
||||
}
|
||||
|
||||
impl CgroupWatcher {
|
||||
/// Create a new `CgroupWatcher`.
|
||||
#[tracing::instrument(skip_all, fields(%name))]
|
||||
pub fn new(
|
||||
name: String,
|
||||
// A channel on which to send upscale requests
|
||||
upscale_requester: mpsc::Sender<()>,
|
||||
) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
|
||||
// TODO: clarify exactly why we need v2
|
||||
// Make sure cgroups v2 (aka unified) are supported
|
||||
if !is_cgroup2_unified_mode() {
|
||||
anyhow::bail!("cgroups v2 not supported");
|
||||
}
|
||||
let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);
|
||||
|
||||
// Start monitoring the cgroup for memory events. In general, for
|
||||
// cgroups v2 (aka unified), metrics are reported in files like
|
||||
// > `/sys/fs/cgroup/{name}/{metric}`
|
||||
// We are looking for `memory.high` events, which are stored in the
|
||||
// file `memory.events`. For more info, see the `memory.events` section
|
||||
// of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
|
||||
let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
|
||||
let memory_events = create_file_watcher(&path)
|
||||
.with_context(|| format!("failed to create event watcher for {path}"))?
|
||||
// This would be nicer with .inspect_err followed by .ok
|
||||
.filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
|
||||
Ok(high) => Some(high),
|
||||
Err(error) => {
|
||||
// TODO: Might want to just panic here
|
||||
warn!(?error, "failed to read high events count from {}", &path);
|
||||
None
|
||||
}
|
||||
})
|
||||
// Only report the event if the memory.high count increased
|
||||
.filter_map(|high| {
|
||||
if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
|
||||
Some(high)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.map(Sequenced::new);
|
||||
|
||||
let initial_count = get_event_count(
|
||||
&format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
|
||||
MemoryEvent::High,
|
||||
)?;
|
||||
|
||||
info!(initial_count, "initial memory.high event count");
|
||||
|
||||
// Hard update `MEMORY_EVENT_COUNT` since there could have been processes
|
||||
// running in the cgroup before that caused it to be non-zero.
|
||||
MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
|
||||
|
||||
Ok((
|
||||
Self {
|
||||
cgroup,
|
||||
upscale_requester,
|
||||
last_upscale_seqnum: AtomicU64::new(0),
|
||||
config: Default::default(),
|
||||
},
|
||||
memory_events,
|
||||
))
|
||||
}
|
||||
|
||||
/// The entrypoint for the `CgroupWatcher`.
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn watch<E>(
|
||||
&self,
|
||||
// These are ~dependency injected~ (fancy, I know) because this function
|
||||
// should never return.
|
||||
// -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
|
||||
// -> therefore: if we want to stick it in an Arc so many threads can access
|
||||
// it, methods can never take mutable access.
|
||||
// - note: we use the Arc strategy so that a) we can call this function
|
||||
// right here and b) the runner can call the set/get_memory methods
|
||||
// -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
|
||||
// we just pass them in here instead of holding them in fields, as that
|
||||
// would require this method to take &mut self.
|
||||
mut upscales: mpsc::Receiver<Sequenced<Resources>>,
|
||||
events: E,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
E: Stream<Item = Sequenced<u64>>,
|
||||
{
|
||||
// There are several actions we might take when receiving a `memory.high`,
|
||||
// such as freezing the cgroup, or increasing its `memory.high`. We don't
|
||||
// want to do these things too often (because postgres needs to run, and
|
||||
// we only have so much memory). These timers serve as rate limits for this.
|
||||
let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
|
||||
let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO));
|
||||
let mut events = pin!(events);
|
||||
|
||||
// Are we waiting to be upscaled? Could be true if we request upscale due
|
||||
// to a memory.high event and it does not arrive in time.
|
||||
let mut waiting_on_upscale = false;
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
upscale = upscales.recv() => {
|
||||
let Sequenced { seqnum, data } = upscale
|
||||
.context("failed to listen on upscale notification channel")?;
|
||||
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
|
||||
info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
|
||||
}
|
||||
event = events.next() => {
|
||||
let Some(Sequenced { seqnum, .. }) = event else {
|
||||
bail!("failed to listen for memory.high events")
|
||||
};
|
||||
// The memory.high came before our last upscale, so we consider
|
||||
// it resolved
|
||||
if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
|
||||
info!(
|
||||
"received memory.high event, but it came before our last upscale -> ignoring it"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
// The memory.high came after our latest upscale. We don't
|
||||
// want to do anything yet, so peek the next event in hopes
|
||||
// that it's an upscale.
|
||||
if let Some(upscale_num) = self
|
||||
.upscaled(&mut upscales)
|
||||
.context("failed to check if we were upscaled")?
|
||||
{
|
||||
if upscale_num > seqnum {
|
||||
info!(
|
||||
"received memory.high event, but it came before our last upscale -> ignoring it"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// If it's been long enough since we last froze, freeze the
|
||||
// cgroup and request upscale
|
||||
if wait_to_freeze.is_elapsed() {
|
||||
info!("received memory.high event -> requesting upscale");
|
||||
waiting_on_upscale = self
|
||||
.handle_memory_high_event(&mut upscales)
|
||||
.await
|
||||
.context("failed to handle upscale")?;
|
||||
wait_to_freeze
|
||||
.as_mut()
|
||||
.reset(Instant::now() + self.config.do_not_freeze_more_often_than);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Ok, we can't freeze, just request upscale
|
||||
if !waiting_on_upscale {
|
||||
info!("received memory.high event, but too soon to refreeze -> requesting upscale");
|
||||
|
||||
// Check to make sure we haven't been upscaled in the
|
||||
// meantime (can happen if the agent independently decides
|
||||
// to upscale us again)
|
||||
if self
|
||||
.upscaled(&mut upscales)
|
||||
.context("failed to check if we were upscaled")?
|
||||
.is_some()
|
||||
{
|
||||
info!("no need to request upscaling because we got upscaled");
|
||||
continue;
|
||||
}
|
||||
self.upscale_requester
|
||||
.send(())
|
||||
.await
|
||||
.context("failed to request upscale")?;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Shoot, we can't freeze and we're still waiting on upscale,
|
||||
// increase memory.high to reduce throttling
|
||||
if wait_to_increase_memory_high.is_elapsed() {
|
||||
info!(
|
||||
"received memory.high event, \
|
||||
but too soon to refreeze and already requested upscale \
|
||||
-> increasing memory.high"
|
||||
);
|
||||
|
||||
// Check to make sure we haven't been upscaled in the
|
||||
// meantime (can happen if the agent independently decides
|
||||
// to upscale us again)
|
||||
if self
|
||||
.upscaled(&mut upscales)
|
||||
.context("failed to check if we were upscaled")?
|
||||
.is_some()
|
||||
{
|
||||
info!("no need to increase memory.high because got upscaled");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Request upscale anyways (the agent will handle deduplicating
|
||||
// requests)
|
||||
self.upscale_requester
|
||||
.send(())
|
||||
.await
|
||||
.context("failed to request upscale")?;
|
||||
|
||||
let memory_high =
|
||||
self.get_high_bytes().context("failed to get memory.high")?;
|
||||
let new_high = memory_high + self.config.memory_high_increase_by_bytes;
|
||||
info!(
|
||||
current_high_bytes = memory_high,
|
||||
new_high_bytes = new_high,
|
||||
"updating memory.high"
|
||||
);
|
||||
self.set_high_bytes(new_high)
|
||||
.context("failed to set memory.high")?;
|
||||
wait_to_increase_memory_high
|
||||
.as_mut()
|
||||
.reset(Instant::now() + self.config.memory_high_increase_every)
|
||||
}
|
||||
|
||||
// we can't do anything
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle a `memory.high`, returning whether we are still waiting on upscale
|
||||
/// by the time the function returns.
|
||||
///
|
||||
/// The general plan for handling a `memory.high` event is as follows:
|
||||
/// 1. Freeze the cgroup
|
||||
/// 2. Start a timer for `self.config.max_upscale_wait`
|
||||
/// 3. Request upscale
|
||||
/// 4. After the timer elapses or we receive upscale, thaw the cgroup.
|
||||
/// 5. Return whether or not we are still waiting for upscale. If we are,
|
||||
/// we'll increase the cgroup's memory.high to avoid getting oom killed
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn handle_memory_high_event(
|
||||
&self,
|
||||
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
|
||||
) -> anyhow::Result<bool> {
|
||||
// Immediately freeze the cgroup before doing anything else.
|
||||
info!("received memory.high event -> freezing cgroup");
|
||||
self.freeze().context("failed to freeze cgroup")?;
|
||||
|
||||
// We'll use this for logging durations
|
||||
let start_time = Instant::now();
|
||||
|
||||
// Await the upscale until we have to unfreeze
|
||||
let timed =
|
||||
tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
|
||||
|
||||
// Request the upscale
|
||||
info!(
|
||||
wait = ?self.config.max_upscale_wait,
|
||||
"sending request for immediate upscaling",
|
||||
);
|
||||
self.upscale_requester
|
||||
.send(())
|
||||
.await
|
||||
.context("failed to request upscale")?;
|
||||
|
||||
let waiting_on_upscale = match timed.await {
|
||||
Ok(Ok(())) => {
|
||||
info!(elapsed = ?start_time.elapsed(), "received upscale in time");
|
||||
false
|
||||
}
|
||||
// **important**: unfreeze the cgroup before ?-reporting the error
|
||||
Ok(Err(e)) => {
|
||||
info!("error waiting for upscale -> thawing cgroup");
|
||||
self.thaw()
|
||||
.context("failed to thaw cgroup after errored waiting for upscale")?;
|
||||
Err(e.context("failed to await upscale"))?
|
||||
}
|
||||
Err(_) => {
|
||||
info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
|
||||
true
|
||||
}
|
||||
};
|
||||
|
||||
info!("thawing cgroup");
|
||||
self.thaw().context("failed to thaw cgroup")?;
|
||||
|
||||
Ok(waiting_on_upscale)
|
||||
}
|
||||
|
||||
/// Checks whether we were just upscaled, returning the upscale's sequence
|
||||
/// number if so.
|
||||
#[tracing::instrument(skip_all)]
|
||||
fn upscaled(
|
||||
&self,
|
||||
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
|
||||
) -> anyhow::Result<Option<u64>> {
|
||||
let Sequenced { seqnum, data } = match upscales.try_recv() {
|
||||
Ok(upscale) => upscale,
|
||||
Err(TryRecvError::Empty) => return Ok(None),
|
||||
Err(TryRecvError::Disconnected) => {
|
||||
bail!("upscale notification channel was disconnected")
|
||||
}
|
||||
};
|
||||
|
||||
// Make sure to update the last upscale sequence number
|
||||
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
|
||||
info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
|
||||
Ok(Some(seqnum))
|
||||
}
|
||||
|
||||
/// Await an upscale event, discarding any `memory.high` events received in
|
||||
/// the process.
|
||||
///
|
||||
/// This is used in `handle_memory_high_event`, where we need to listen
|
||||
/// for upscales in particular so we know if we can thaw the cgroup early.
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn await_upscale(
|
||||
&self,
|
||||
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
|
||||
) -> anyhow::Result<()> {
|
||||
let Sequenced { seqnum, .. } = upscales
|
||||
.recv()
|
||||
.await
|
||||
.context("error listening for upscales")?;
|
||||
|
||||
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get the cgroup's name.
|
||||
pub fn path(&self) -> &str {
|
||||
self.cgroup.path()
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents a set of limits we apply to a cgroup to control memory usage.
|
||||
///
|
||||
/// Setting these values also affects the thresholds for receiving usage alerts.
|
||||
#[derive(Debug)]
|
||||
pub struct MemoryLimits {
|
||||
high: u64,
|
||||
max: u64,
|
||||
}
|
||||
|
||||
impl MemoryLimits {
|
||||
pub fn new(high: u64, max: u64) -> Self {
|
||||
Self { max, high }
|
||||
}
|
||||
}
|
||||
|
||||
// Methods for manipulating the actual cgroup
|
||||
impl CgroupWatcher {
|
||||
/// Get a handle on the freezer subsystem.
|
||||
fn freezer(&self) -> anyhow::Result<&FreezerController> {
|
||||
if let Some(Freezer(freezer)) = self
|
||||
.cgroup
|
||||
.subsystems()
|
||||
.iter()
|
||||
.find(|sub| matches!(sub, Freezer(_)))
|
||||
{
|
||||
Ok(freezer)
|
||||
} else {
|
||||
anyhow::bail!("could not find freezer subsystem")
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempt to freeze the cgroup.
|
||||
pub fn freeze(&self) -> anyhow::Result<()> {
|
||||
self.freezer()
|
||||
.context("failed to get freezer subsystem")?
|
||||
.freeze()
|
||||
.context("failed to freeze")
|
||||
}
|
||||
|
||||
/// Attempt to thaw the cgroup.
|
||||
pub fn thaw(&self) -> anyhow::Result<()> {
|
||||
self.freezer()
|
||||
.context("failed to get freezer subsystem")?
|
||||
.thaw()
|
||||
.context("failed to thaw")
|
||||
}
|
||||
|
||||
/// Get a handle on the memory subsystem.
|
||||
///
|
||||
/// Note: this method does not require `self.memory_update_lock` because
|
||||
/// getting a handle to the subsystem does not access any of the files we
|
||||
/// care about, such as memory.high and memory.events
|
||||
fn memory(&self) -> anyhow::Result<&MemController> {
|
||||
if let Some(Mem(memory)) = self
|
||||
.cgroup
|
||||
.subsystems()
|
||||
.iter()
|
||||
.find(|sub| matches!(sub, Mem(_)))
|
||||
{
|
||||
Ok(memory)
|
||||
} else {
|
||||
anyhow::bail!("could not find memory subsystem")
|
||||
}
|
||||
}
|
||||
|
||||
/// Get cgroup current memory usage.
|
||||
pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
|
||||
Ok(self
|
||||
.memory()
|
||||
.context("failed to get memory subsystem")?
|
||||
.memory_stat()
|
||||
.usage_in_bytes)
|
||||
}
|
||||
|
||||
/// Set cgroup memory.high threshold.
|
||||
pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
|
||||
self.memory()
|
||||
.context("failed to get memory subsystem")?
|
||||
.set_mem(cgroups_rs::memory::SetMemory {
|
||||
low: None,
|
||||
high: Some(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)),
|
||||
min: None,
|
||||
max: None,
|
||||
})
|
||||
.context("failed to set memory.high")
|
||||
}
|
||||
|
||||
/// Set cgroup memory.high and memory.max.
|
||||
pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
|
||||
info!(
|
||||
limits.high,
|
||||
limits.max,
|
||||
path = self.path(),
|
||||
"writing new memory limits",
|
||||
);
|
||||
self.memory()
|
||||
.context("failed to get memory subsystem while setting memory limits")?
|
||||
.set_mem(cgroups_rs::memory::SetMemory {
|
||||
min: None,
|
||||
low: None,
|
||||
high: Some(MaxValue::Value(
|
||||
u64::min(limits.high, i64::MAX as u64) as i64
|
||||
)),
|
||||
max: Some(MaxValue::Value(u64::min(limits.max, i64::MAX as u64) as i64)),
|
||||
})
|
||||
.context("failed to set memory limits")
|
||||
}
|
||||
|
||||
/// Given some amount of available memory, set the desired cgroup memory limits
|
||||
pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
|
||||
let new_high = self.config.calculate_memory_high_value(available_memory);
|
||||
let limits = MemoryLimits::new(new_high, available_memory);
|
||||
info!(
|
||||
path = self.path(),
|
||||
memory = ?limits,
|
||||
"setting cgroup memory",
|
||||
);
|
||||
self.set_limits(&limits)
|
||||
.context("failed to set cgroup memory limits")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get memory.high threshold.
|
||||
pub fn get_high_bytes(&self) -> anyhow::Result<u64> {
|
||||
let high = self
|
||||
.memory()
|
||||
.context("failed to get memory subsystem while getting memory statistics")?
|
||||
.get_mem()
|
||||
.map(|mem| mem.high)
|
||||
.context("failed to get memory statistics from subsystem")?;
|
||||
match high {
|
||||
Some(MaxValue::Max) => Ok(i64::MAX as u64),
|
||||
Some(MaxValue::Value(high)) => Ok(high as u64),
|
||||
None => anyhow::bail!("failed to read memory.high from memory subsystem"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,153 +0,0 @@
|
||||
//! Managing the websocket connection and other signals in the monitor.
|
||||
//!
|
||||
//! Contains types that manage the interaction (not data interchange, see `protocol`)
|
||||
//! between agent and monitor, allowing us to process and send messages in a
|
||||
//! straightforward way. The dispatcher also manages the signals that come from
|
||||
//! the cgroup (requesting upscale), and the signals that go to the cgroup
|
||||
//! (notifying it of upscale).
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use axum::extract::ws::{Message, WebSocket};
|
||||
use futures::{
|
||||
stream::{SplitSink, SplitStream},
|
||||
SinkExt, StreamExt,
|
||||
};
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::info;
|
||||
|
||||
use crate::cgroup::Sequenced;
|
||||
use crate::protocol::{
|
||||
OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
|
||||
PROTOCOL_MIN_VERSION,
|
||||
};
|
||||
|
||||
/// The central handler for all communications in the monitor.
|
||||
///
|
||||
/// The dispatcher has two purposes:
|
||||
/// 1. Manage the connection to the agent, sending and receiving messages.
|
||||
/// 2. Communicate with the cgroup manager, notifying it when upscale is received,
|
||||
/// and sending a message to the agent when the cgroup manager requests
|
||||
/// upscale.
|
||||
#[derive(Debug)]
|
||||
pub struct Dispatcher {
|
||||
/// We read agent messages off of `source`
|
||||
pub(crate) source: SplitStream<WebSocket>,
|
||||
|
||||
/// We send messages to the agent through `sink`
|
||||
sink: SplitSink<WebSocket, Message>,
|
||||
|
||||
/// Used to notify the cgroup when we are upscaled.
|
||||
pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
|
||||
|
||||
/// When the cgroup requests upscale it will send on this channel. In response
|
||||
/// we send an `UpscaleRequest` to the agent.
|
||||
pub(crate) request_upscale_events: mpsc::Receiver<()>,
|
||||
|
||||
/// The protocol version we have agreed to use with the agent. This is negotiated
|
||||
/// during the creation of the dispatcher, and should be the highest shared protocol
|
||||
/// version.
|
||||
///
|
||||
// NOTE: currently unused, but will almost certainly be used in the future
|
||||
// as the protocol changes
|
||||
#[allow(unused)]
|
||||
pub(crate) proto_version: ProtocolVersion,
|
||||
}
|
||||
|
||||
impl Dispatcher {
|
||||
/// Creates a new dispatcher using the passed-in connection.
|
||||
///
|
||||
/// Performs a negotiation with the agent to determine the highest protocol
|
||||
/// version that both support. This consists of two steps:
|
||||
/// 1. Wait for the agent to send the range of protocols it supports.
|
||||
/// 2. Send a protocol version that works for us as well, or an error if there
|
||||
/// is no compatible version.
|
||||
pub async fn new(
|
||||
stream: WebSocket,
|
||||
notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
|
||||
request_upscale_events: mpsc::Receiver<()>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let (mut sink, mut source) = stream.split();
|
||||
|
||||
// Figure out the highest protocol version we both support
|
||||
info!("waiting for agent to send protocol version range");
|
||||
let Some(message) = source.next().await else {
|
||||
bail!("websocket connection closed while performing protocol handshake")
|
||||
};
|
||||
|
||||
let message = message.context("failed to read protocol version range off connection")?;
|
||||
|
||||
let Message::Text(message_text) = message else {
|
||||
// All messages should be in text form, since we don't do any
|
||||
// pinging/ponging. See nhooyr/websocket's implementation and the
|
||||
// agent for more info
|
||||
bail!("received non-text message during proocol handshake: {message:?}")
|
||||
};
|
||||
|
||||
let monitor_range = ProtocolRange {
|
||||
min: PROTOCOL_MIN_VERSION,
|
||||
max: PROTOCOL_MAX_VERSION,
|
||||
};
|
||||
|
||||
let agent_range: ProtocolRange = serde_json::from_str(&message_text)
|
||||
.context("failed to deserialize protocol version range")?;
|
||||
|
||||
info!(range = ?agent_range, "received protocol version range");
|
||||
|
||||
let highest_shared_version = match monitor_range.highest_shared_version(&agent_range) {
|
||||
Ok(version) => {
|
||||
sink.send(Message::Text(
|
||||
serde_json::to_string(&ProtocolResponse::Version(version)).unwrap(),
|
||||
))
|
||||
.await
|
||||
.context("failed to notify agent of negotiated protocol version")?;
|
||||
version
|
||||
}
|
||||
Err(e) => {
|
||||
sink.send(Message::Text(
|
||||
serde_json::to_string(&ProtocolResponse::Error(format!(
|
||||
"Received protocol version range {} which does not overlap with {}",
|
||||
agent_range, monitor_range
|
||||
)))
|
||||
.unwrap(),
|
||||
))
|
||||
.await
|
||||
.context("failed to notify agent of no overlap between protocol version ranges")?;
|
||||
Err(e).context("error determining suitable protocol version range")?
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
sink,
|
||||
source,
|
||||
notify_upscale_events,
|
||||
request_upscale_events,
|
||||
proto_version: highest_shared_version,
|
||||
})
|
||||
}
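// Illustrative handshake, added for clarity (the exact JSON follows the serde
// derives in `protocol`, so treat the shapes as a sketch): the agent opens by
// sending its supported range, e.g. `{"min": 1, "max": 1}`, and the monitor
// replies with either `{"version": 1}` on success or `{"error": "..."}` when
// the ranges do not overlap.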
|
||||
|
||||
/// Notify the cgroup manager that we have received upscale and wait for
|
||||
/// the acknowledgement.
|
||||
#[tracing::instrument(skip_all, fields(?resources))]
|
||||
pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
|
||||
self.notify_upscale_events
|
||||
.send(resources)
|
||||
.await
|
||||
.context("failed to send resources and oneshot sender across channel")
|
||||
}
|
||||
|
||||
/// Send a message to the agent.
|
||||
///
|
||||
/// Although this function is small, it has one major benefit: it is the only
|
||||
/// way to send data across the connection, and you can only pass in a proper
|
||||
/// `OutboundMsg`. Without safeguards like this, it's easy to accidentally
|
||||
/// serialize the wrong thing and send it, since `self.sink.send` will take
|
||||
/// any string.
|
||||
pub async fn send(&mut self, message: OutboundMsg) -> anyhow::Result<()> {
|
||||
info!(?message, "sending message");
|
||||
let json = serde_json::to_string(&message).context("failed to serialize message")?;
|
||||
self.sink
|
||||
.send(Message::Text(json))
|
||||
.await
|
||||
.context("stream error sending message")
|
||||
}
|
||||
}
|
||||
@@ -1,316 +0,0 @@
|
||||
//! Logic for configuring and scaling the Postgres file cache.
|
||||
|
||||
use std::num::NonZeroU64;
|
||||
|
||||
use crate::MiB;
|
||||
use anyhow::{anyhow, Context};
|
||||
use tokio_postgres::{types::ToSql, Client, NoTls, Row};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info};
|
||||
|
||||
/// Manages Postgres' file cache by keeping a connection open.
|
||||
#[derive(Debug)]
|
||||
pub struct FileCacheState {
|
||||
client: Client,
|
||||
conn_str: String,
|
||||
pub(crate) config: FileCacheConfig,
|
||||
|
||||
/// A token for cancelling spawned threads during shutdown.
|
||||
token: CancellationToken,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct FileCacheConfig {
|
||||
/// Whether the file cache is *actually* stored in memory (e.g. by writing to
|
||||
/// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
|
||||
/// memory available for the cgroup.
|
||||
pub(crate) in_memory: bool,
|
||||
|
||||
/// The size of the file cache, in terms of the size of the resource it consumes
|
||||
/// (currently: only memory)
|
||||
///
|
||||
/// For example, setting `resource_multiplier = 0.75` gives the cache a target size of 75% of total
|
||||
/// resources.
|
||||
///
|
||||
/// This value must be strictly between 0 and 1.
|
||||
resource_multiplier: f64,
|
||||
|
||||
/// The required minimum amount of memory, in bytes, that must remain available
|
||||
/// after subtracting the file cache.
|
||||
///
|
||||
/// This value must be non-zero.
|
||||
min_remaining_after_cache: NonZeroU64,
|
||||
|
||||
/// Controls the rate of increase in the file cache's size as it grows from zero
|
||||
/// (when total resources equals min_remaining_after_cache) to the desired size based on
|
||||
/// `resource_multiplier`.
|
||||
///
|
||||
/// A `spread_factor` of zero means that all additional resources will go to the cache until it
|
||||
/// reaches the desired size. Setting `spread_factor` to N roughly means "for every 1 byte added to
|
||||
/// the cache's size, N bytes are reserved for the rest of the system, until the cache gets to
|
||||
/// its desired size".
|
||||
///
|
||||
/// This value must be >= 0, and must retain an increase that is more than what would be given by
|
||||
/// `resource_multiplier`. For example, setting `resource_multiplier` = 0.75 but `spread_factor` = 1
|
||||
/// would be invalid, because `spread_factor` would induce only 50% usage - never reaching the 75%
|
||||
/// as desired by `resource_multiplier`.
|
||||
///
|
||||
/// `spread_factor` is too large if `(spread_factor + 1) * resource_multiplier >= 1`.
|
||||
spread_factor: f64,
|
||||
}
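// Worked check of the `spread_factor` rule above (illustrative, using the defaults
// defined below): with resource_multiplier = 0.75 and spread_factor = 0.1,
// (0.1 + 1) * 0.75 = 0.825 < 1, so the pair validates; a spread_factor of 0.4 would
// give (0.4 + 1) * 0.75 = 1.05 >= 1 and be rejected by `validate`.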
|
||||
|
||||
impl FileCacheConfig {
|
||||
pub fn default_in_memory() -> Self {
|
||||
Self {
|
||||
in_memory: true,
|
||||
// 75 %
|
||||
resource_multiplier: 0.75,
|
||||
// 640 MiB; (512 + 128)
|
||||
min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
|
||||
// ensure any increase in file cache size is split 90-10 with 10% to other memory
|
||||
spread_factor: 0.1,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn default_on_disk() -> Self {
|
||||
Self {
|
||||
in_memory: false,
|
||||
resource_multiplier: 0.75,
|
||||
// 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
|
||||
// memory, the kernel will just evict from its page cache, rather than e.g. killing
|
||||
// everything.
|
||||
min_remaining_after_cache: NonZeroU64::new(256 * MiB).unwrap(),
|
||||
spread_factor: 0.1,
|
||||
}
|
||||
}
|
||||
|
||||
/// Make sure fields of the config are consistent.
|
||||
pub fn validate(&self) -> anyhow::Result<()> {
|
||||
// Single field validity
|
||||
anyhow::ensure!(
|
||||
0.0 < self.resource_multiplier && self.resource_multiplier < 1.0,
|
||||
"resource_multiplier must be between 0.0 and 1.0 exclusive, got {}",
|
||||
self.resource_multiplier
|
||||
);
|
||||
anyhow::ensure!(
|
||||
self.spread_factor >= 0.0,
|
||||
"spread_factor must be >= 0, got {}",
|
||||
self.spread_factor
|
||||
);
|
||||
|
||||
// Check that `resource_multiplier` and `spread_factor` are valid w.r.t. each other.
|
||||
//
|
||||
// As shown in `calculate_cache_size`, we have two lines resulting from `resource_multiplier` and
|
||||
// `spread_factor`, respectively. They are:
|
||||
//
|
||||
// size = `total` / (`spread_factor` + 1) - `min_remaining_after_cache` / (`spread_factor` + 1)
|
||||
//
|
||||
// and
|
||||
//
|
||||
// size = `resource_multiplier` × total
|
||||
//
|
||||
// .. where `total` is the total resources. These are isomorphic to the typical 'y = mx + b'
|
||||
// form, with y = "size" and x = "total".
|
||||
//
|
||||
// These lines intersect at:
|
||||
//
|
||||
// `min_remaining_after_cache` / (1 - `resource_multiplier` × (`spread_factor` + 1))
|
||||
//
|
||||
// We want to ensure that this value (a) exists, and (b) is >= `min_remaining_after_cache`. This is
|
||||
// guaranteed when '`resource_multiplier` × (`spread_factor` + 1)' is less than 1.
|
||||
// (We also need it to be >= 0, but that's already guaranteed.)
|
||||
|
||||
let intersect_factor = self.resource_multiplier * (self.spread_factor + 1.0);
|
||||
anyhow::ensure!(
|
||||
intersect_factor < 1.0,
|
||||
"incompatible resource_multipler and spread_factor"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Calculate the desired size of the cache, given the total memory
|
||||
pub fn calculate_cache_size(&self, total: u64) -> u64 {
|
||||
// *Note*: all units are in bytes, until the very last line.
|
||||
let available = total.saturating_sub(self.min_remaining_after_cache.get());
|
||||
if available == 0 {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Conversions to ensure we don't overflow from floating-point ops
|
||||
let size_from_spread =
|
||||
i64::max(0, (available as f64 / (1.0 + self.spread_factor)) as i64) as u64;
|
||||
|
||||
let size_from_normal = (total as f64 * self.resource_multiplier) as u64;
|
||||
|
||||
let byte_size = u64::min(size_from_spread, size_from_normal);
|
||||
|
||||
// The file cache operates in units of mebibytes, so the sizes we produce should
|
||||
// be rounded to a mebibyte. We round down to be conservative.
|
||||
byte_size / MiB * MiB
|
||||
}
|
||||
}
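// Illustrative sketch, not part of the original source: a test-style check of how the
// two sizing rules in `calculate_cache_size` interact for the in-memory defaults,
// assuming a hypothetical 4 GiB total.
#[cfg(test)]
mod cache_size_example {
    use super::*;
    use crate::MiB;

    #[test]
    fn four_gib_with_in_memory_defaults() {
        let config = FileCacheConfig::default_in_memory();
        let total = 4096 * MiB;
        // available       = 4096 MiB - 640 MiB = 3456 MiB
        // from spread     = 3456 MiB / 1.1     ~ 3141 MiB
        // from multiplier = 4096 MiB * 0.75    = 3072 MiB
        // the smaller value wins and is rounded down to a whole mebibyte
        assert_eq!(config.calculate_cache_size(total), 3072 * MiB);
    }
}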
|
||||
|
||||
impl FileCacheState {
|
||||
/// Connect to the file cache.
|
||||
#[tracing::instrument(skip_all, fields(%conn_str, ?config))]
|
||||
pub async fn new(
|
||||
conn_str: &str,
|
||||
config: FileCacheConfig,
|
||||
token: CancellationToken,
|
||||
) -> anyhow::Result<Self> {
|
||||
config.validate().context("file cache config is invalid")?;
|
||||
|
||||
info!(conn_str, "connecting to Postgres file cache");
|
||||
let client = FileCacheState::connect(conn_str, token.clone())
|
||||
.await
|
||||
.context("failed to connect to postgres file cache")?;
|
||||
|
||||
let conn_str = conn_str.to_string();
|
||||
Ok(Self {
|
||||
client,
|
||||
config,
|
||||
conn_str,
|
||||
token,
|
||||
})
|
||||
}
|
||||
|
||||
/// Connect to Postgres.
|
||||
///
|
||||
/// Aborts the spawned thread if the kill signal is received. This is not
|
||||
/// a method as it is called in [`FileCacheState::new`].
|
||||
#[tracing::instrument(skip_all, fields(%conn_str))]
|
||||
async fn connect(conn_str: &str, token: CancellationToken) -> anyhow::Result<Client> {
|
||||
let (client, conn) = tokio_postgres::connect(conn_str, NoTls)
|
||||
.await
|
||||
.context("failed to connect to pg client")?;
|
||||
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own. See tokio-postgres docs.
|
||||
crate::spawn_with_cancel(
|
||||
token,
|
||||
|res| {
|
||||
if let Err(error) = res {
|
||||
error!(%error, "postgres error")
|
||||
}
|
||||
},
|
||||
conn,
|
||||
);
|
||||
|
||||
Ok(client)
|
||||
}
|
||||
|
||||
/// Execute a query with a retry if necessary.
|
||||
///
|
||||
/// If the initial query fails, we restart the database connection and attempt
|
||||
/// it again.
|
||||
#[tracing::instrument(skip_all, fields(%statement))]
|
||||
pub async fn query_with_retry(
|
||||
&mut self,
|
||||
statement: &str,
|
||||
params: &[&(dyn ToSql + Sync)],
|
||||
) -> anyhow::Result<Vec<Row>> {
|
||||
match self
|
||||
.client
|
||||
.query(statement, params)
|
||||
.await
|
||||
.context("failed to execute query")
|
||||
{
|
||||
Ok(rows) => Ok(rows),
|
||||
Err(e) => {
|
||||
error!(error = ?e, "postgres error: {e} -> retrying");
|
||||
|
||||
let client = FileCacheState::connect(&self.conn_str, self.token.clone())
|
||||
.await
|
||||
.context("failed to connect to postgres file cache")?;
|
||||
info!("successfully reconnected to postgres client");
|
||||
|
||||
// Replace the old client and attempt the query with the new one
|
||||
self.client = client;
|
||||
self.client
|
||||
.query(statement, params)
|
||||
.await
|
||||
.context("failed to execute query a second time")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the current size of the file cache.
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn get_file_cache_size(&mut self) -> anyhow::Result<u64> {
|
||||
self.query_with_retry(
|
||||
// The file cache GUC variable is in MiB, but the conversion with
|
||||
// pg_size_bytes means that the end result we get is in bytes.
|
||||
"SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit'));",
|
||||
&[],
|
||||
)
|
||||
.await
|
||||
.context("failed to query pg for file cache size")?
|
||||
.first()
|
||||
.ok_or_else(|| anyhow!("file cache size query returned no rows"))?
|
||||
// pg_size_bytes returns a bigint which is the same as an i64.
|
||||
.try_get::<_, i64>(0)
|
||||
// Since the size of the table is not negative, the cast is sound.
|
||||
.map(|bytes| bytes as u64)
|
||||
.context("failed to extract file cache size from query result")
|
||||
}
|
||||
|
||||
/// Attempt to set the file cache size, returning the size it was actually
|
||||
/// set to.
|
||||
#[tracing::instrument(skip_all, fields(%num_bytes))]
|
||||
pub async fn set_file_cache_size(&mut self, num_bytes: u64) -> anyhow::Result<u64> {
|
||||
let max_bytes = self
|
||||
// The file cache GUC variable is in MiB, but the conversion with pg_size_bytes
|
||||
// means that the end result we get is in bytes.
|
||||
.query_with_retry(
|
||||
"SELECT pg_size_bytes(current_setting('neon.max_file_cache_size'));",
|
||||
&[],
|
||||
)
|
||||
.await
|
||||
.context("failed to query pg for max file cache size")?
|
||||
.first()
|
||||
.ok_or_else(|| anyhow!("max file cache size query returned no rows"))?
|
||||
.try_get::<_, i64>(0)
|
||||
.map(|bytes| bytes as u64)
|
||||
.context("failed to extract max file cache size from query result")?;
|
||||
|
||||
let max_mb = max_bytes / MiB;
|
||||
let num_mb = u64::min(num_bytes, max_bytes) / MiB;
|
||||
|
||||
let capped = if num_bytes > max_bytes {
|
||||
" (capped by maximum size)"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
|
||||
info!(
|
||||
size = num_mb,
|
||||
max = max_mb,
|
||||
"updating file cache size {capped}",
|
||||
);
|
||||
|
||||
// note: even though the normal ways to get the cache size produce values with trailing "MB"
|
||||
// (hence why we call pg_size_bytes in `get_file_cache_size`'s query), the format
|
||||
// it expects to set the value is "integer number of MB" without trailing units.
|
||||
// For some reason, this *really* wasn't working with normal arguments, so that's
|
||||
// why we're constructing the query here.
|
||||
self.client
|
||||
.query(
|
||||
&format!("ALTER SYSTEM SET neon.file_cache_size_limit = {};", num_mb),
|
||||
&[],
|
||||
)
|
||||
.await
|
||||
.context("failed to change file cache size limit")?;
|
||||
|
||||
// must use pg_reload_conf to have the settings change take effect
|
||||
self.client
|
||||
.execute("SELECT pg_reload_conf();", &[])
|
||||
.await
|
||||
.context("failed to reload config")?;
|
||||
|
||||
Ok(num_mb * MiB)
|
||||
}
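// Worked example of the rounding above (illustrative numbers): requesting
// 1_500_000_000 bytes with a 2 GiB maximum gives num_mb = 1430, so the GUC is set to
// 1430 and the function reports 1430 * MiB = 1_499_463_680 bytes back to the caller.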
|
||||
}
|
||||
@@ -1,215 +0,0 @@
|
||||
#![cfg(target_os = "linux")]
|
||||
|
||||
use anyhow::Context;
|
||||
use axum::{
|
||||
extract::{ws::WebSocket, State, WebSocketUpgrade},
|
||||
response::Response,
|
||||
};
|
||||
use axum::{routing::get, Router, Server};
|
||||
use clap::Parser;
|
||||
use futures::Future;
|
||||
use std::{fmt::Debug, time::Duration};
|
||||
use sysinfo::{RefreshKind, System, SystemExt};
|
||||
use tokio::{sync::broadcast, task::JoinHandle};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info};
|
||||
|
||||
use runner::Runner;
|
||||
|
||||
// Code that interfaces with agent
|
||||
pub mod dispatcher;
|
||||
pub mod protocol;
|
||||
|
||||
pub mod cgroup;
|
||||
pub mod filecache;
|
||||
pub mod runner;
|
||||
|
||||
/// The vm-monitor is an autoscaling component started by compute_ctl.
|
||||
///
|
||||
/// It carries out autoscaling decisions (upscaling/downscaling) and responds to
|
||||
/// memory pressure by making requests to the autoscaler-agent.
|
||||
#[derive(Debug, Parser)]
|
||||
pub struct Args {
|
||||
/// The name of the cgroup we should monitor for memory.high events. This
|
||||
/// is the cgroup that postgres should be running in.
|
||||
#[arg(short, long)]
|
||||
pub cgroup: Option<String>,
|
||||
|
||||
/// The connection string for the Postgres file cache we should manage.
|
||||
#[arg(short, long)]
|
||||
pub pgconnstr: Option<String>,
|
||||
|
||||
/// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
|
||||
/// kernel's page cache), and therefore should not count against available memory.
|
||||
//
|
||||
// NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
|
||||
// than a roundabout way, via whether it's on disk), but in order to be backwards compatible
|
||||
// during the switch away from an in-memory file cache, we had to default to the previous
|
||||
// behavior.
|
||||
#[arg(long)]
|
||||
pub file_cache_on_disk: bool,
|
||||
|
||||
/// The address we should listen on for connection requests. For the
|
||||
/// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
|
||||
#[arg(short, long)]
|
||||
pub addr: String,
|
||||
}
|
||||
|
||||
impl Args {
|
||||
pub fn addr(&self) -> &str {
|
||||
&self.addr
|
||||
}
|
||||
}
|
||||
|
||||
/// The number of bytes in one mebibyte.
|
||||
#[allow(non_upper_case_globals)]
|
||||
const MiB: u64 = 1 << 20;
|
||||
|
||||
/// Convert a quantity in bytes to a quantity in mebibytes, generally for display
|
||||
/// purposes. (Most calculations in this crate use bytes directly)
|
||||
pub fn bytes_to_mebibytes(bytes: u64) -> f32 {
|
||||
(bytes as f32) / (MiB as f32)
|
||||
}
|
||||
|
||||
pub fn get_total_system_memory() -> u64 {
|
||||
System::new_with_specifics(RefreshKind::new().with_memory()).total_memory()
|
||||
}
|
||||
|
||||
/// Global app state for the Axum server
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ServerState {
|
||||
/// Used to close old connections.
|
||||
///
|
||||
/// When a new connection is made, we send a message signalling to the old
|
||||
/// connection to close.
|
||||
pub sender: broadcast::Sender<()>,
|
||||
|
||||
/// Used to cancel all spawned threads in the monitor.
|
||||
pub token: CancellationToken,
|
||||
|
||||
// The CLI args
|
||||
pub args: &'static Args,
|
||||
}
|
||||
|
||||
/// Spawn a thread that may get cancelled by the provided [`CancellationToken`].
|
||||
///
|
||||
/// This is mainly meant to be called with futures that will be pending for a very
|
||||
/// long time, or are not meant to return. If it is not desirable for the future to
|
||||
/// ever resolve, such as in the case of [`cgroup::CgroupWatcher::watch`], the error can
|
||||
/// be logged with `f`.
|
||||
pub fn spawn_with_cancel<T, F>(
|
||||
token: CancellationToken,
|
||||
f: F,
|
||||
future: T,
|
||||
) -> JoinHandle<Option<T::Output>>
|
||||
where
|
||||
T: Future + Send + 'static,
|
||||
T::Output: Send + 'static,
|
||||
F: FnOnce(&T::Output) + Send + 'static,
|
||||
{
|
||||
tokio::spawn(async move {
|
||||
tokio::select! {
|
||||
_ = token.cancelled() => {
|
||||
info!("received global kill signal");
|
||||
None
|
||||
}
|
||||
res = future => {
|
||||
f(&res);
|
||||
Some(res)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
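// Hedged usage sketch (`some_long_running_future` is a placeholder, not an item in
// this crate): log unexpected exits of a background task while still letting the
// global token cancel it.
//
//     spawn_with_cancel(
//         token.clone(),
//         |_| tracing::error!("background task exited unexpectedly"),
//         some_long_running_future,
//     );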
|
||||
|
||||
/// The entrypoint to the binary.
|
||||
///
|
||||
/// Set up tracing, parse arguments, and start an http server.
|
||||
pub async fn start(args: &'static Args, token: CancellationToken) -> anyhow::Result<()> {
|
||||
// This channel is used to close old connections. When a new connection is
|
||||
// made, we send a message signalling to the old connection to close.
|
||||
let (sender, _) = tokio::sync::broadcast::channel::<()>(1);
|
||||
|
||||
let app = Router::new()
|
||||
// This route gets upgraded to a websocket connection. We only support
|
||||
// one connection at a time, which we enforce by killing old connections
|
||||
// when we receive a new one.
|
||||
.route("/monitor", get(ws_handler))
|
||||
.with_state(ServerState {
|
||||
sender,
|
||||
token,
|
||||
args,
|
||||
});
|
||||
|
||||
let addr = args.addr();
|
||||
let bound = Server::try_bind(&addr.parse().expect("parsing address should not fail"))
|
||||
.with_context(|| format!("failed to bind to {addr}"))?;
|
||||
|
||||
info!(addr, "server bound");
|
||||
|
||||
bound
|
||||
.serve(app.into_make_service())
|
||||
.await
|
||||
.context("server exited")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handles incoming websocket connections.
|
||||
///
|
||||
/// If we are already connected to an agent, we kill that old connection
|
||||
/// and accept the new one.
|
||||
#[tracing::instrument(name = "/monitor", skip_all, fields(?args))]
|
||||
pub async fn ws_handler(
|
||||
ws: WebSocketUpgrade,
|
||||
State(ServerState {
|
||||
sender,
|
||||
token,
|
||||
args,
|
||||
}): State<ServerState>,
|
||||
) -> Response {
|
||||
// Kill the old monitor
|
||||
info!("closing old connection if there is one");
|
||||
let _ = sender.send(());
|
||||
|
||||
// Start the new one. Wow, the cycle of death and rebirth
|
||||
let closer = sender.subscribe();
|
||||
ws.on_upgrade(|ws| start_monitor(ws, args, closer, token))
|
||||
}
|
||||
|
||||
/// Starts the monitor. If startup fails or the monitor exits, an error will
|
||||
/// be logged and our internal state will be reset to allow for new connections.
|
||||
#[tracing::instrument(skip_all, fields(?args))]
|
||||
async fn start_monitor(
|
||||
ws: WebSocket,
|
||||
args: &Args,
|
||||
kill: broadcast::Receiver<()>,
|
||||
token: CancellationToken,
|
||||
) {
|
||||
info!("accepted new websocket connection -> starting monitor");
|
||||
let timeout = Duration::from_secs(4);
|
||||
let monitor = tokio::time::timeout(
|
||||
timeout,
|
||||
Runner::new(Default::default(), args, ws, kill, token),
|
||||
)
|
||||
.await;
|
||||
let mut monitor = match monitor {
|
||||
Ok(Ok(monitor)) => monitor,
|
||||
Ok(Err(error)) => {
|
||||
error!(?error, "failed to create monitor");
|
||||
return;
|
||||
}
|
||||
Err(_) => {
|
||||
error!(
|
||||
?timeout,
|
||||
"creating monitor timed out (probably waiting to receive protocol range)"
|
||||
);
|
||||
return;
|
||||
}
|
||||
};
|
||||
info!("connected to agent");
|
||||
|
||||
match monitor.run().await {
|
||||
Ok(()) => info!("monitor was killed due to new connection"),
|
||||
Err(e) => error!(error = ?e, "monitor terminated unexpectedly"),
|
||||
}
|
||||
}
|
||||
@@ -1,241 +0,0 @@
|
||||
//! Types representing protocols and actual agent-monitor messages.
|
||||
//!
|
||||
//! The pervasive use of serde modifiers throughout this module is to ease
|
||||
//! serialization on the go side. Because go does not have enums (which model
|
||||
//! messages well), it is harder to model messages, and we accommodate that with
|
||||
//! serde.
|
||||
//!
|
||||
//! *Note*: the agent sends and receives messages in different ways.
|
||||
//!
|
||||
//! The agent serializes messages in the following form and then sends them. The use
|
||||
//! of `#[serde(tag = "type", content = "content")]` allows us to use `Type`
|
||||
//! to determine how to deserialize `Content`.
|
||||
//! ```ignore
|
||||
//! struct {
|
||||
//! Content any
|
||||
//! Type string
|
||||
//! Id uint64
|
||||
//! }
|
||||
//! ```
|
||||
//! and receives messages in the form:
|
||||
//! ```ignore
|
||||
//! struct {
|
||||
//! {fields embedded}
|
||||
//! Type string
|
||||
//! Id uint64
|
||||
//! }
|
||||
//! ```
|
||||
//! After reading the type field, the agent will decode the entire message
|
||||
//! again, this time into the correct type using the embedded fields.
|
||||
//! Because the agent cannot just extract the json contained in a certain field
|
||||
//! (it initially deserializes to `map[string]interface{}`), we keep the fields
|
||||
//! at the top level, so the entire piece of json can be deserialized into a struct,
|
||||
//! such as a `DownscaleResult`, with the `Type` and `Id` fields ignored.
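//!
//! As a rough illustration (the exact output is determined by the serde derives
//! below, so treat these shapes as a sketch): a monitor-to-agent upscale request
//! serializes to something like `{"type": "UpscaleRequest", "id": 3}`, while an
//! agent-to-monitor upscale notification arrives as something like
//! `{"type": "UpscaleNotification", "content": {"granted": {"cpu": 1.0, "mem": 1073741824}}, "id": 4}`.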
|
||||
|
||||
use core::fmt;
|
||||
use std::cmp;
|
||||
|
||||
use serde::{de::Error, Deserialize, Serialize};
|
||||
|
||||
/// A Message we send to the agent.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct OutboundMsg {
|
||||
#[serde(flatten)]
|
||||
pub(crate) inner: OutboundMsgKind,
|
||||
pub(crate) id: usize,
|
||||
}
|
||||
|
||||
impl OutboundMsg {
|
||||
pub fn new(inner: OutboundMsgKind, id: usize) -> Self {
|
||||
Self { inner, id }
|
||||
}
|
||||
}
|
||||
|
||||
/// The different underlying message types we can send to the agent.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
#[serde(tag = "type")]
|
||||
pub enum OutboundMsgKind {
|
||||
/// Indicates that the agent sent an invalid message, i.e., we couldn't
|
||||
/// properly deserialize it.
|
||||
InvalidMessage { error: String },
|
||||
/// Indicates that we experienced an internal error while processing a message.
|
||||
/// For example, if a cgroup operation fails while trying to handle an upscale,
|
||||
/// we return `InternalError`.
|
||||
InternalError { error: String },
|
||||
/// Returned to the agent once we have finished handling an upscale. If the
|
||||
/// handling was unsuccessful, an `InternalError` will get returned instead.
|
||||
/// *Note*: this is a struct variant because of the way go serializes struct{}
|
||||
UpscaleConfirmation {},
|
||||
/// Indicates to the monitor that we are urgently requesting resources.
|
||||
/// *Note*: this is a struct variant because of the way go serializes struct{}
|
||||
UpscaleRequest {},
|
||||
/// Returned to the agent once we have finished attempting to downscale. If
|
||||
/// an error occurred trying to do so, an `InternalError` will get returned instead.
|
||||
/// However, if we are simply unsuccessful (for example, due to needing the resources),
|
||||
/// that gets included in the `DownscaleResult`.
|
||||
DownscaleResult {
|
||||
// FIXME for the future (once the informant is deprecated)
|
||||
// As of the time of writing, the agent/informant version of this struct is
|
||||
// called api.DownscaleResult. This struct has uppercase fields which are
|
||||
// serialized as such. Thus, we serialize using uppercase names so we don't
|
||||
// have to make a breaking change to the agent<->informant protocol. Once
|
||||
// the informant has been superseded by the monitor, we can add the correct
|
||||
// struct tags to api.DownscaleResult without causing a breaking change,
|
||||
// since we don't need to support the agent<->informant protocol anymore.
|
||||
#[serde(rename = "Ok")]
|
||||
ok: bool,
|
||||
#[serde(rename = "Status")]
|
||||
status: String,
|
||||
},
|
||||
/// Part of the bidirectional heartbeat. The heartbeat is initiated by the
|
||||
/// agent.
|
||||
/// *Note*: this is a struct variant because of the way go serializes struct{}
|
||||
HealthCheck {},
|
||||
}
|
||||
|
||||
/// A message received from the agent.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct InboundMsg {
|
||||
#[serde(flatten)]
|
||||
pub(crate) inner: InboundMsgKind,
|
||||
pub(crate) id: usize,
|
||||
}
|
||||
|
||||
/// The different underlying message types we can receive from the agent.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
#[serde(tag = "type", content = "content")]
|
||||
pub enum InboundMsgKind {
|
||||
/// Indicates that we sent an invalid message, i.e., the agent couldn't
|
||||
/// properly deserialize it.
|
||||
InvalidMessage { error: String },
|
||||
/// Indicates that the informant experienced an internal error while processing
|
||||
/// a message. For example, if it failed to request upscale from the agent, it
|
||||
/// would return an `InternalError`.
|
||||
InternalError { error: String },
|
||||
/// Indicates to us that we have been granted more resources. We should respond
|
||||
/// with an `UpscaleConfirmation` when done handling the resources (increasing
|
||||
/// file cache size, cgroup memory limits).
|
||||
UpscaleNotification { granted: Resources },
|
||||
/// A request to reduce resource usage. We should respond with a `DownscaleResult`,
|
||||
/// when done.
|
||||
DownscaleRequest { target: Resources },
|
||||
/// Part of the bidirectional heartbeat. The heartbeat is initiated by the
|
||||
/// agent.
|
||||
/// *Note*: this is a struct variant because of the way go serializes struct{}
|
||||
HealthCheck {},
|
||||
}
|
||||
|
||||
/// Represents the resources granted to a VM.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Copy)]
|
||||
// Renamed because the agent has multiple resource types:
|
||||
// `Resources` (milliCPU/memory slots)
|
||||
// `Allocation` (vCPU/bytes) <- what we correspond to
|
||||
#[serde(rename(serialize = "Allocation", deserialize = "Allocation"))]
|
||||
pub struct Resources {
|
||||
/// Number of vCPUs
|
||||
pub(crate) cpu: f64,
|
||||
/// Bytes of memory
|
||||
pub(crate) mem: u64,
|
||||
}
|
||||
|
||||
impl Resources {
|
||||
pub fn new(cpu: f64, mem: u64) -> Self {
|
||||
Self { cpu, mem }
|
||||
}
|
||||
}
|
||||
|
||||
pub const PROTOCOL_MIN_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
|
||||
pub const PROTOCOL_MAX_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Ord, Eq, Serialize, Deserialize)]
|
||||
pub struct ProtocolVersion(u8);
|
||||
|
||||
impl ProtocolVersion {
|
||||
/// Represents v1.0 of the agent <-> monitor protocol - the initial version
|
||||
///
|
||||
/// Currently the latest version.
|
||||
const V1_0: ProtocolVersion = ProtocolVersion(1);
|
||||
}
|
||||
|
||||
impl fmt::Display for ProtocolVersion {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match *self {
|
||||
ProtocolVersion(0) => f.write_str("<invalid: zero>"),
|
||||
ProtocolVersion::V1_0 => f.write_str("v1.0"),
|
||||
other => write!(f, "<unknown: {other}>"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A set of protocol bounds that determines what we are speaking.
|
||||
///
|
||||
/// These bounds are inclusive.
|
||||
#[derive(Debug)]
|
||||
pub struct ProtocolRange {
|
||||
pub min: ProtocolVersion,
|
||||
pub max: ProtocolVersion,
|
||||
}
|
||||
|
||||
// Use a custom deserialize impl to ensure that `self.min <= self.max`
|
||||
impl<'de> Deserialize<'de> for ProtocolRange {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
#[derive(Deserialize)]
|
||||
struct InnerProtocolRange {
|
||||
min: ProtocolVersion,
|
||||
max: ProtocolVersion,
|
||||
}
|
||||
let InnerProtocolRange { min, max } = InnerProtocolRange::deserialize(deserializer)?;
|
||||
if min > max {
|
||||
Err(D::Error::custom(format!(
|
||||
"min version = {min} is greater than max version = {max}",
|
||||
)))
|
||||
} else {
|
||||
Ok(ProtocolRange { min, max })
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ProtocolRange {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if self.min == self.max {
|
||||
f.write_fmt(format_args!("{}", self.max))
|
||||
} else {
|
||||
f.write_fmt(format_args!("{} to {}", self.min, self.max))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ProtocolRange {
|
||||
/// Find the highest shared version between two `ProtocolRange`'s
|
||||
pub fn highest_shared_version(&self, other: &Self) -> anyhow::Result<ProtocolVersion> {
|
||||
// We first have to make sure the ranges are overlapping. Once we know
|
||||
// this, we can merge the ranges by taking the max of the mins and the
|
||||
// mins of the maxes.
|
||||
if self.min > other.max {
|
||||
anyhow::bail!(
|
||||
"Non-overlapping bounds: other.max = {} was less than self.min = {}",
|
||||
other.max,
|
||||
self.min,
|
||||
)
|
||||
} else if self.max < other.min {
|
||||
anyhow::bail!(
|
||||
"Non-overlappinng bounds: self.max = {} was less than other.min = {}",
|
||||
self.max,
|
||||
other.min
|
||||
)
|
||||
} else {
|
||||
Ok(cmp::min(self.max, other.max))
|
||||
}
|
||||
}
|
||||
}
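// Illustrative negotiation, added for clarity: with the monitor fixed at v1.0..=v1.0
// and an agent advertising, say, v1.0..=v1.1, the ranges overlap and
// `highest_shared_version` returns min(self.max, other.max) = v1.0.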
|
||||
|
||||
/// We send this to the monitor after negotiating which protocol to use
|
||||
#[derive(Serialize, Debug)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub enum ProtocolResponse {
|
||||
Error(String),
|
||||
Version(ProtocolVersion),
|
||||
}
|
||||
@@ -1,460 +0,0 @@
|
||||
//! Exposes the `Runner`, which handles messages received from agent and
|
||||
//! sends upscale requests.
|
||||
//!
|
||||
//! This is the "Monitor" part of the monitor binary and is the main entrypoint for
|
||||
//! all functionality.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::{fmt::Debug, mem};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use axum::extract::ws::{Message, WebSocket};
|
||||
use futures::StreamExt;
|
||||
use tokio::sync::broadcast;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced};
|
||||
use crate::dispatcher::Dispatcher;
|
||||
use crate::filecache::{FileCacheConfig, FileCacheState};
|
||||
use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
|
||||
use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args, MiB};
|
||||
|
||||
/// Central struct that interacts with agent, dispatcher, and cgroup to handle
|
||||
/// signals from the agent.
|
||||
#[derive(Debug)]
|
||||
pub struct Runner {
|
||||
config: Config,
|
||||
filecache: Option<FileCacheState>,
|
||||
cgroup: Option<Arc<CgroupWatcher>>,
|
||||
dispatcher: Dispatcher,
|
||||
|
||||
/// We "mint" new message ids by incrementing this counter and taking the value.
|
||||
///
|
||||
/// **Note**: This counter is always odd, so that we avoid collisions between the IDs generated
|
||||
/// by us vs the autoscaler-agent.
|
||||
counter: usize,
|
||||
|
||||
/// A signal to kill the main thread produced by `self.run()`. This is triggered
|
||||
/// when the server receives a new connection. When the thread receives the
|
||||
/// signal off this channel, it will gracefully shutdown.
|
||||
kill: broadcast::Receiver<()>,
|
||||
}
|
||||
|
||||
/// Configuration for a `Runner`
|
||||
#[derive(Debug)]
|
||||
pub struct Config {
|
||||
/// `sys_buffer_bytes` gives the estimated amount of memory, in bytes, that the kernel uses before
|
||||
/// handing out the rest to userspace. This value is the estimated difference between the
|
||||
/// *actual* physical memory and the amount reported by `grep MemTotal /proc/meminfo`.
|
||||
///
|
||||
/// For more information, refer to `man 5 proc`, which defines MemTotal as "Total usable RAM
|
||||
/// (i.e., physical RAM minus a few reserved bits and the kernel binary code)".
|
||||
///
|
||||
/// We only use `sys_buffer_bytes` when calculating the system memory from the *external* memory
|
||||
/// size, rather than the self-reported memory size, according to the kernel.
|
||||
///
|
||||
/// TODO: this field is only necessary while we still have to trust the autoscaler-agent's
|
||||
/// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
|
||||
/// should be removed once we have a better solution there.
|
||||
sys_buffer_bytes: u64,
|
||||
}
|
||||
|
||||
impl Default for Config {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
sys_buffer_bytes: 100 * MiB,
|
||||
}
|
||||
}
|
||||
}
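// Worked example (illustrative): if the agent reports 8 GiB of memory, the runner
// treats 8 GiB - 100 MiB as `usable_system_memory`, and the file cache and cgroup
// limits computed in `try_downscale` / `handle_upscale` are derived from that
// reduced figure.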
|
||||
|
||||
impl Runner {
|
||||
/// Create a new monitor.
|
||||
#[tracing::instrument(skip_all, fields(?config, ?args))]
|
||||
pub async fn new(
|
||||
config: Config,
|
||||
args: &Args,
|
||||
ws: WebSocket,
|
||||
kill: broadcast::Receiver<()>,
|
||||
token: CancellationToken,
|
||||
) -> anyhow::Result<Runner> {
|
||||
anyhow::ensure!(
|
||||
config.sys_buffer_bytes != 0,
|
||||
"invalid monitor Config: sys_buffer_bytes cannot be 0"
|
||||
);
|
||||
|
||||
// *NOTE*: the dispatcher and cgroup manager talk through these channels
|
||||
// so make sure they each get the correct half, nothing is dropped, etc.
|
||||
let (notified_send, notified_recv) = mpsc::channel(1);
|
||||
let (requesting_send, requesting_recv) = mpsc::channel(1);
|
||||
|
||||
let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
|
||||
.await
|
||||
.context("error creating new dispatcher")?;
|
||||
|
||||
let mut state = Runner {
|
||||
config,
|
||||
filecache: None,
|
||||
cgroup: None,
|
||||
dispatcher,
|
||||
counter: 1, // NB: must be odd, see the comment about the field for more.
|
||||
kill,
|
||||
};
|
||||
|
||||
let mut file_cache_reserved_bytes = 0;
|
||||
let mem = get_total_system_memory();
|
||||
|
||||
// We need to process file cache initialization before cgroup initialization, so that the memory
|
||||
// allocated to the file cache is appropriately taken into account when we decide the cgroup's
|
||||
// memory limits.
|
||||
if let Some(connstr) = &args.pgconnstr {
|
||||
info!("initializing file cache");
|
||||
let config = match args.file_cache_on_disk {
|
||||
true => FileCacheConfig::default_on_disk(),
|
||||
false => FileCacheConfig::default_in_memory(),
|
||||
};
|
||||
|
||||
let mut file_cache = FileCacheState::new(connstr, config, token.clone())
|
||||
.await
|
||||
.context("failed to create file cache")?;
|
||||
|
||||
let size = file_cache
|
||||
.get_file_cache_size()
|
||||
.await
|
||||
.context("error getting file cache size")?;
|
||||
|
||||
let new_size = file_cache.config.calculate_cache_size(mem);
|
||||
info!(
|
||||
initial = bytes_to_mebibytes(size),
|
||||
new = bytes_to_mebibytes(new_size),
|
||||
"setting initial file cache size",
|
||||
);
|
||||
|
||||
// note: even if size == new_size, we want to explicitly set it, just
|
||||
// to make sure that we have the permissions to do so
|
||||
let actual_size = file_cache
|
||||
.set_file_cache_size(new_size)
|
||||
.await
|
||||
.context("failed to set file cache size, possibly due to inadequate permissions")?;
|
||||
if actual_size != new_size {
|
||||
info!("file cache size actually got set to {actual_size}")
|
||||
}
|
||||
// Mark the resources given to the file cache as reserved, but only if it's in memory.
|
||||
if !args.file_cache_on_disk {
|
||||
file_cache_reserved_bytes = actual_size;
|
||||
}
|
||||
|
||||
state.filecache = Some(file_cache);
|
||||
}
|
||||
|
||||
if let Some(name) = &args.cgroup {
|
||||
let (mut cgroup, cgroup_event_stream) =
|
||||
CgroupWatcher::new(name.clone(), requesting_send)
|
||||
.context("failed to create cgroup manager")?;
|
||||
|
||||
let available = mem - file_cache_reserved_bytes;
|
||||
|
||||
cgroup
|
||||
.set_memory_limits(available)
|
||||
.context("failed to set cgroup memory limits")?;
|
||||
|
||||
let cgroup = Arc::new(cgroup);
|
||||
|
||||
// Some might call this . . . cgroup v2
|
||||
let cgroup_clone = Arc::clone(&cgroup);
|
||||
|
||||
spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
|
||||
cgroup_clone.watch(notified_recv, cgroup_event_stream).await
|
||||
});
|
||||
|
||||
state.cgroup = Some(cgroup);
|
||||
} else {
|
||||
// *NOTE*: We need to forget the sender so that its drop impl does not get ran.
|
||||
// This allows us to poll it in `Monitor::run` regardless of whether we
|
||||
// are managing a cgroup or not. If we don't forget it, all receives will
|
||||
// immediately return an error because the sender is dropped and it will
|
||||
// claim all select! statements, effectively turning `Monitor::run` into
|
||||
// `loop { fail to receive }`.
|
||||
mem::forget(requesting_send);
|
||||
}
|
||||
|
||||
Ok(state)
|
||||
}
|
||||
|
||||
/// Attempt to downscale filecache + cgroup
|
||||
#[tracing::instrument(skip_all, fields(?target))]
|
||||
pub async fn try_downscale(&mut self, target: Resources) -> anyhow::Result<(bool, String)> {
|
||||
// Nothing to adjust
|
||||
if self.cgroup.is_none() && self.filecache.is_none() {
|
||||
info!("no action needed for downscale (no cgroup or file cache enabled)");
|
||||
return Ok((
|
||||
true,
|
||||
"monitor is not managing cgroup or file cache".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let requested_mem = target.mem;
|
||||
let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
|
||||
let expected_file_cache_mem_usage = self
|
||||
.filecache
|
||||
.as_ref()
|
||||
.map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
|
||||
.unwrap_or(0);
|
||||
let mut new_cgroup_mem_high = 0;
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
new_cgroup_mem_high = cgroup
|
||||
.config
|
||||
.calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);
|
||||
|
||||
let current = cgroup
|
||||
.current_memory_usage()
|
||||
.context("failed to fetch cgroup memory")?;
|
||||
|
||||
if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
|
||||
let status = format!(
|
||||
"{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
|
||||
"calculated memory.high too low",
|
||||
bytes_to_mebibytes(new_cgroup_mem_high),
|
||||
bytes_to_mebibytes(current),
|
||||
bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
|
||||
);
|
||||
|
||||
info!(status, "discontinuing downscale");
|
||||
|
||||
return Ok((false, status));
|
||||
}
|
||||
}
|
||||
|
||||
// The downscaling has been approved. Downscale the file cache, then the cgroup.
|
||||
let mut status = vec![];
|
||||
let mut file_cache_mem_usage = 0;
|
||||
if let Some(file_cache) = &mut self.filecache {
|
||||
let actual_usage = file_cache
|
||||
.set_file_cache_size(expected_file_cache_mem_usage)
|
||||
.await
|
||||
.context("failed to set file cache size")?;
|
||||
if file_cache.config.in_memory {
|
||||
file_cache_mem_usage = actual_usage;
|
||||
}
|
||||
let message = format!(
|
||||
"set file cache size to {} MiB (in memory = {})",
|
||||
bytes_to_mebibytes(actual_usage),
|
||||
file_cache.config.in_memory,
|
||||
);
|
||||
info!("downscale: {message}");
|
||||
status.push(message);
|
||||
}
|
||||
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
let available_memory = usable_system_memory - file_cache_mem_usage;
|
||||
|
||||
if file_cache_mem_usage != expected_file_cache_mem_usage {
|
||||
new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
|
||||
}
|
||||
|
||||
let limits = MemoryLimits::new(
|
||||
// new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be 0 here
|
||||
// since it is properly initialized in the previous cgroup if let block
|
||||
new_cgroup_mem_high,
|
||||
available_memory,
|
||||
);
|
||||
cgroup
|
||||
.set_limits(&limits)
|
||||
.context("failed to set cgroup memory limits")?;
|
||||
|
||||
let message = format!(
|
||||
"set cgroup memory.high to {} MiB, of new max {} MiB",
|
||||
bytes_to_mebibytes(new_cgroup_mem_high),
|
||||
bytes_to_mebibytes(available_memory)
|
||||
);
|
||||
info!("downscale: {message}");
|
||||
status.push(message);
|
||||
}
|
||||
|
||||
// TODO: make this status thing less jank
|
||||
let status = status.join("; ");
|
||||
Ok((true, status))
|
||||
}
|
||||
|
||||
/// Handle new resources
|
||||
#[tracing::instrument(skip_all, fields(?resources))]
|
||||
pub async fn handle_upscale(&mut self, resources: Resources) -> anyhow::Result<()> {
|
||||
if self.filecache.is_none() && self.cgroup.is_none() {
|
||||
info!("no action needed for upscale (no cgroup or file cache enabled)");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let new_mem = resources.mem;
|
||||
let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);
|
||||
|
||||
// Get the file cache's expected contribution to the memory usage
|
||||
let mut file_cache_mem_usage = 0;
|
||||
if let Some(file_cache) = &mut self.filecache {
|
||||
let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
|
||||
info!(
|
||||
target = bytes_to_mebibytes(expected_usage),
|
||||
total = bytes_to_mebibytes(new_mem),
|
||||
"updating file cache size",
|
||||
);
|
||||
|
||||
let actual_usage = file_cache
|
||||
.set_file_cache_size(expected_usage)
|
||||
.await
|
||||
.context("failed to set file cache size")?;
|
||||
if file_cache.config.in_memory {
|
||||
file_cache_mem_usage = actual_usage;
|
||||
}
|
||||
|
||||
if actual_usage != expected_usage {
|
||||
warn!(
|
||||
"file cache was set to a different size that we wanted: target = {} Mib, actual= {} Mib",
|
||||
bytes_to_mebibytes(expected_usage),
|
||||
bytes_to_mebibytes(actual_usage)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
let available_memory = usable_system_memory - file_cache_mem_usage;
|
||||
let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
|
||||
info!(
|
||||
target = bytes_to_mebibytes(new_cgroup_mem_high),
|
||||
total = bytes_to_mebibytes(new_mem),
|
||||
name = cgroup.path(),
|
||||
"updating cgroup memory.high",
|
||||
);
|
||||
let limits = MemoryLimits::new(new_cgroup_mem_high, available_memory);
|
||||
cgroup
|
||||
.set_limits(&limits)
|
||||
.context("failed to set file cache size")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Take in a message and perform some action, such as downscaling or upscaling,
|
||||
/// and return a message to be sent back.
|
||||
#[tracing::instrument(skip_all, fields(%id, message = ?inner))]
|
||||
pub async fn process_message(
|
||||
&mut self,
|
||||
InboundMsg { inner, id }: InboundMsg,
|
||||
) -> anyhow::Result<Option<OutboundMsg>> {
|
||||
match inner {
|
||||
InboundMsgKind::UpscaleNotification { granted } => {
|
||||
self.handle_upscale(granted)
|
||||
.await
|
||||
.context("failed to handle upscale")?;
|
||||
self.dispatcher
|
||||
.notify_upscale(Sequenced::new(granted))
|
||||
.await
|
||||
.context("failed to notify notify cgroup of upscale")?;
|
||||
Ok(Some(OutboundMsg::new(
|
||||
OutboundMsgKind::UpscaleConfirmation {},
|
||||
id,
|
||||
)))
|
||||
}
|
||||
InboundMsgKind::DownscaleRequest { target } => self
|
||||
.try_downscale(target)
|
||||
.await
|
||||
.context("failed to downscale")
|
||||
.map(|(ok, status)| {
|
||||
Some(OutboundMsg::new(
|
||||
OutboundMsgKind::DownscaleResult { ok, status },
|
||||
id,
|
||||
))
|
||||
}),
|
||||
InboundMsgKind::InvalidMessage { error } => {
|
||||
warn!(
|
||||
%error, id, "received notification of an invalid message we sent"
|
||||
);
|
||||
Ok(None)
|
||||
}
|
||||
InboundMsgKind::InternalError { error } => {
|
||||
warn!(error, id, "agent experienced an internal error");
|
||||
Ok(None)
|
||||
}
|
||||
InboundMsgKind::HealthCheck {} => {
|
||||
Ok(Some(OutboundMsg::new(OutboundMsgKind::HealthCheck {}, id)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: don't propagate errors, probably just warn!?
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn run(&mut self) -> anyhow::Result<()> {
|
||||
info!("starting dispatcher");
|
||||
loop {
|
||||
tokio::select! {
|
||||
signal = self.kill.recv() => {
|
||||
match signal {
|
||||
Ok(()) => return Ok(()),
|
||||
Err(e) => bail!("failed to receive kill signal: {e}")
|
||||
}
|
||||
}
|
||||
// we need to propagate an upscale request
|
||||
request = self.dispatcher.request_upscale_events.recv() => {
|
||||
if request.is_none() {
|
||||
bail!("failed to listen for upscale event from cgroup")
|
||||
}
|
||||
info!("cgroup asking for upscale; forwarding request");
|
||||
self.counter += 2; // Increment, preserving parity (i.e. keep the
|
||||
// counter odd). See the field comment for more.
|
||||
self.dispatcher
|
||||
.send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter))
|
||||
.await
|
||||
.context("failed to send message")?;
|
||||
}
|
||||
// there is a message from the agent
|
||||
msg = self.dispatcher.source.next() => {
|
||||
if let Some(msg) = msg {
|
||||
// Don't use 'message' as a key as the string also uses
|
||||
// that for its key
|
||||
info!(?msg, "received message");
|
||||
match msg {
|
||||
Ok(msg) => {
|
||||
let message: InboundMsg = match msg {
|
||||
Message::Text(text) => {
|
||||
serde_json::from_str(&text).context("failed to deserialize text message")?
|
||||
}
|
||||
other => {
|
||||
warn!(
|
||||
// Don't use 'message' as a key as the
|
||||
// string also uses that for its key
|
||||
msg = ?other,
|
||||
"agent should only send text messages but received different type"
|
||||
);
|
||||
continue
|
||||
},
|
||||
};
|
||||
|
||||
let out = match self.process_message(message.clone()).await {
|
||||
Ok(Some(out)) => out,
|
||||
Ok(None) => continue,
|
||||
Err(e) => {
|
||||
let error = e.to_string();
|
||||
warn!(?error, "error handling message");
|
||||
OutboundMsg::new(
|
||||
OutboundMsgKind::InternalError {
|
||||
error
|
||||
},
|
||||
message.id
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
self.dispatcher
|
||||
.send(out)
|
||||
.await
|
||||
.context("failed to send message")?;
|
||||
}
|
||||
Err(e) => warn!("{e}"),
|
||||
}
|
||||
} else {
|
||||
anyhow::bail!("dispatcher connection closed")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
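
As an aside, the counter bump above keeps the monitor's message ids odd; the following small sketch shows that scheme. The assumption that the agent allocates ids of the opposite parity is ours, taken from the field comment referenced above rather than from this hunk:

struct OddIdCounter(usize);

impl OddIdCounter {
    fn new() -> Self {
        Self(1) // start odd
    }
    fn next(&mut self) -> usize {
        let id = self.0;
        self.0 += 2; // stepping by two preserves the starting parity
        id
    }
}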
|
||||
@@ -51,7 +51,6 @@ serde.workspace = true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
serde_with.workspace = true
|
||||
signal-hook.workspace = true
|
||||
smallvec = { workspace = true, features = ["write"] }
|
||||
svg_fmt.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
|
||||
@@ -10,7 +10,7 @@ use std::{fs, path::Path, str};
|
||||
|
||||
use pageserver::page_cache::PAGE_SZ;
|
||||
use pageserver::repository::{Key, KEY_SIZE};
|
||||
use pageserver::tenant::block_io::FileBlockReader;
|
||||
use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
|
||||
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
|
||||
use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
|
||||
use pageserver::tenant::storage_layer::range_overlaps;
|
||||
@@ -97,7 +97,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
|
||||
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
|
||||
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
let file = FileBlockReader::new(VirtualFile::open(path)?);
|
||||
let summary_blk = file.read_blk(0).await?;
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
actual_summary.index_start_blk,
|
||||
|
||||
@@ -44,11 +44,13 @@ pub(crate) enum LayerCmd {
|
||||
}
|
||||
|
||||
async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
use pageserver::tenant::block_io::BlockReader;
|
||||
|
||||
let path = path.as_ref();
|
||||
virtual_file::init(10);
|
||||
page_cache::init(100);
|
||||
let file = FileBlockReader::new(VirtualFile::open(path)?);
|
||||
let summary_blk = file.read_blk(0).await?;
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
actual_summary.index_start_blk,
|
||||
@@ -68,7 +70,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let cursor = BlockCursor::new_fileblockreader_virtual(&file);
|
||||
let cursor = BlockCursor::new(&file);
|
||||
for (k, v) in all {
|
||||
let value = cursor.read_blob(v.pos()).await?;
|
||||
println!("key:{} value_len:{}", k, value.len());
|
||||
|
||||
@@ -2,14 +2,12 @@
|
||||
|
||||
use std::env::{var, VarError};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
|
||||
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
|
||||
use pageserver::deletion_queue::{DeletionQueue, DeletionQueueError};
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
@@ -351,35 +349,6 @@ fn start_pageserver(
|
||||
// Set up remote storage client
|
||||
let remote_storage = create_remote_storage_client(conf)?;
|
||||
|
||||
// Set up deletion queue
|
||||
let deletion_queue_cancel = tokio_util::sync::CancellationToken::new();
|
||||
let (deletion_queue, deletion_frontend, deletion_backend, deletion_executor) =
|
||||
DeletionQueue::new(remote_storage.clone(), conf, deletion_queue_cancel.clone());
|
||||
if let Some(mut deletion_frontend) = deletion_frontend {
|
||||
BACKGROUND_RUNTIME.spawn(async move {
|
||||
deletion_frontend
|
||||
.background()
|
||||
.instrument(info_span!(parent:None, "deletion frontend"))
|
||||
.await
|
||||
});
|
||||
}
|
||||
if let Some(mut deletion_backend) = deletion_backend {
|
||||
BACKGROUND_RUNTIME.spawn(async move {
|
||||
deletion_backend
|
||||
.background()
|
||||
.instrument(info_span!(parent: None, "deletion backend"))
|
||||
.await
|
||||
});
|
||||
}
|
||||
if let Some(mut deletion_executor) = deletion_executor {
|
||||
BACKGROUND_RUNTIME.spawn(async move {
|
||||
deletion_executor
|
||||
.background()
|
||||
.instrument(info_span!(parent: None, "deletion executor"))
|
||||
.await
|
||||
});
|
||||
}
|
||||
|
||||
// Up to this point no significant I/O has been done: this should have been fast. Record
|
||||
// duration prior to starting I/O intensive phase of startup.
|
||||
startup_checkpoint("initial", "Starting loading tenants");
|
||||
@@ -417,7 +386,6 @@ fn start_pageserver(
|
||||
TenantSharedResources {
|
||||
broker_client: broker_client.clone(),
|
||||
remote_storage: remote_storage.clone(),
|
||||
deletion_queue_client: deletion_queue.new_client(),
|
||||
},
|
||||
order,
|
||||
))?;
|
||||
@@ -514,7 +482,6 @@ fn start_pageserver(
|
||||
http_auth,
|
||||
broker_client.clone(),
|
||||
remote_storage,
|
||||
deletion_queue.clone(),
|
||||
disk_usage_eviction_state,
|
||||
)?
|
||||
.build()
|
||||
@@ -637,36 +604,6 @@ fn start_pageserver(
|
||||
// The plan is to change that over time.
|
||||
shutdown_pageserver.take();
|
||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
|
||||
|
||||
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
||||
let dq = deletion_queue.clone();
|
||||
BACKGROUND_RUNTIME.block_on(async move {
|
||||
match tokio::time::timeout(Duration::from_secs(5), dq.new_client().flush()).await {
|
||||
Ok(flush_r) => {
|
||||
match flush_r {
|
||||
Ok(()) => {
|
||||
info!("Deletion queue flushed successfully on shutdown")
|
||||
}
|
||||
Err(e) => {
|
||||
match e {
|
||||
DeletionQueueError::ShuttingDown => {
|
||||
// This is not harmful for correctness, but is unexpected: the deletion
|
||||
// queue's workers should stay alive as long as there are any client handles instantiated.
|
||||
warn!("Deletion queue stopped prematurely");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Timed out flushing deletion queue on shutdown ({e})")
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Clean shutdown of deletion queue workers
|
||||
deletion_queue_cancel.cancel();
|
||||
|
||||
unreachable!()
|
||||
}
|
||||
})
|
||||
|
||||
@@ -204,8 +204,6 @@ pub struct PageServerConf {
|
||||
/// has its initial logical size calculated. Not running background tasks for some seconds is
|
||||
/// not terrible.
|
||||
pub background_task_maximum_delay: Duration,
|
||||
|
||||
pub control_plane_api: Option<Url>,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -280,8 +278,6 @@ struct PageServerConfigBuilder {
|
||||
ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
|
||||
|
||||
background_task_maximum_delay: BuilderValue<Duration>,
|
||||
|
||||
control_plane_api: BuilderValue<Option<Url>>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -344,8 +340,6 @@ impl Default for PageServerConfigBuilder {
|
||||
DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
|
||||
)
|
||||
.unwrap()),
|
||||
|
||||
control_plane_api: Set(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -474,10 +468,6 @@ impl PageServerConfigBuilder {
|
||||
self.background_task_maximum_delay = BuilderValue::Set(delay);
|
||||
}
|
||||
|
||||
pub fn control_plane_api(&mut self, api: Url) {
|
||||
self.control_plane_api = BuilderValue::Set(Some(api))
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_size_logical_size_queries = self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
@@ -563,9 +553,6 @@ impl PageServerConfigBuilder {
|
||||
background_task_maximum_delay: self
|
||||
.background_task_maximum_delay
|
||||
.ok_or(anyhow!("missing background_task_maximum_delay"))?,
|
||||
control_plane_api: self
|
||||
.control_plane_api
|
||||
.ok_or(anyhow!("missing control_plane_api"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -579,27 +566,6 @@ impl PageServerConf {
|
||||
self.workdir.join("tenants")
|
||||
}
|
||||
|
||||
pub fn deletion_prefix(&self) -> PathBuf {
|
||||
self.workdir.join("deletion")
|
||||
}
|
||||
|
||||
pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
|
||||
// Encode a version in the filename, so that if we ever switch away from JSON we can
|
||||
// increment this.
|
||||
const VERSION: u8 = 1;
|
||||
|
||||
self.deletion_prefix()
|
||||
.join(format!("{sequence:016x}-{VERSION:02x}.list"))
|
||||
}
|
||||
|
||||
pub fn deletion_header_path(&self) -> PathBuf {
|
||||
// Encode a version in the filename, so that if we ever switch away from JSON we can
|
||||
// increment this.
|
||||
const VERSION: u8 = 1;
|
||||
|
||||
self.deletion_prefix().join(format!("header-{VERSION:02x}"))
|
||||
}
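
To make the naming above concrete: with VERSION = 1, the first deletion list is written as 0000000000000001-01.list and the header as header-01 under the deletion/ prefix, which is exactly what the tests later in this diff assert.

#[test]
fn deletion_list_name_example() {
    // sequence = 1, VERSION = 1
    assert_eq!(
        format!("{:016x}-{:02x}.list", 1u64, 1u8),
        "0000000000000001-01.list"
    );
}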
|
||||
|
||||
pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||
self.tenants_path().join(tenant_id.to_string())
|
||||
}
|
||||
@@ -677,6 +643,23 @@ impl PageServerConf {
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Files on the remote storage are stored with paths, relative to the workdir.
|
||||
/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path.
|
||||
///
|
||||
/// Errors if the path provided does not start from pageserver's workdir.
|
||||
pub fn remote_path(&self, local_path: &Path) -> anyhow::Result<RemotePath> {
|
||||
local_path
|
||||
.strip_prefix(&self.workdir)
|
||||
.context("Failed to strip workdir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to resolve remote part of path {:?} for base {:?}",
|
||||
local_path, self.workdir
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Turns storage remote path of a file into its local path.
|
||||
pub fn local_path(&self, remote_path: &RemotePath) -> PathBuf {
|
||||
remote_path.with_base(&self.workdir)
|
||||
@@ -775,7 +758,6 @@ impl PageServerConf {
|
||||
},
|
||||
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
|
||||
"background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
|
||||
"control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -944,7 +926,6 @@ impl PageServerConf {
|
||||
test_remote_failures: 0,
|
||||
ondemand_download_behavior_treat_error_as_warn: false,
|
||||
background_task_maximum_delay: Duration::ZERO,
|
||||
control_plane_api: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1168,7 +1149,6 @@ background_task_maximum_delay = '334 s'
|
||||
background_task_maximum_delay: humantime::parse_duration(
|
||||
defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
|
||||
)?,
|
||||
control_plane_api: None
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1224,7 +1204,6 @@ background_task_maximum_delay = '334 s'
|
||||
test_remote_failures: 0,
|
||||
ondemand_download_behavior_treat_error_as_warn: false,
|
||||
background_task_maximum_delay: Duration::from_secs(334),
|
||||
control_plane_api: None
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -1,850 +0,0 @@
|
||||
mod backend;
|
||||
mod executor;
|
||||
mod frontend;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::metrics::DELETION_QUEUE_SUBMITTED;
|
||||
use crate::tenant::remote_timeline_client::remote_timeline_path;
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use serde_with::serde_as;
|
||||
use thiserror::Error;
|
||||
use tokio;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{self, debug, error};
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
pub(crate) use self::backend::BackendQueueWorker;
|
||||
use self::executor::ExecutorWorker;
|
||||
use self::frontend::DeletionOp;
|
||||
pub(crate) use self::frontend::FrontendQueueWorker;
|
||||
use backend::BackendQueueMessage;
|
||||
use executor::ExecutorMessage;
|
||||
use frontend::FrontendQueueMessage;
|
||||
|
||||
use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
|
||||
|
||||
// TODO: administrative "panic button" config property to disable all deletions
|
||||
// TODO: configurable for how long to wait before executing deletions
|
||||
|
||||
/// We aggregate object deletions from many tenants in one place, for several reasons:
|
||||
/// - Coalesce deletions into fewer DeleteObjects calls
|
||||
/// - Enable Tenant/Timeline lifetimes to be shorter than the time it takes
|
||||
/// to flush any outstanding deletions.
|
||||
/// - Globally control throughput of deletions, as these are a low priority task: do
|
||||
/// not compete with the same S3 clients/connections used for higher priority uploads.
|
||||
/// - Future: enable validating that we may do deletions in a multi-attached scenario,
|
||||
/// via generation numbers (see https://github.com/neondatabase/neon/pull/4919)
|
||||
///
|
||||
/// There are two kinds of deletion: deferred and immediate. A deferred deletion
|
||||
/// may be intentionally delayed to protect passive readers of S3 data, and may
|
||||
/// be subject to a generation number validation step. An immediate deletion is
|
||||
/// ready to execute immediately, and is only queued up so that it can be coalesced
|
||||
/// with other deletions in flight.
|
||||
///
|
||||
/// Deferred deletions pass through three steps:
|
||||
/// - Frontend: accumulate deletion requests from Timelines, and batch them up into
|
||||
/// DeletionLists, which are persisted to S3.
|
||||
/// - Backend: accumulate deletion lists, and validate them en-masse prior to passing
|
||||
/// the keys in the list onward for actual deletion
|
||||
/// - Executor: accumulate object keys that the backend has validated for immediate
|
||||
/// deletion, and execute them in batches of 1000 keys via DeleteObjects.
|
||||
///
|
||||
/// Non-deferred deletions, such as during timeline deletion, bypass the first
|
||||
/// two stages and are passed straight into the Executor.
|
||||
///
|
||||
/// Internally, each stage is joined by a channel to the next. In S3, there is only
|
||||
/// one queue (of DeletionLists), which is written by the frontend and consumed
|
||||
/// by the backend.
|
||||
#[derive(Clone)]
|
||||
pub struct DeletionQueue {
|
||||
client: DeletionQueueClient,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FlushOp {
|
||||
tx: tokio::sync::oneshot::Sender<()>,
|
||||
}
|
||||
|
||||
impl FlushOp {
|
||||
fn fire(self) {
|
||||
if self.tx.send(()).is_err() {
|
||||
// oneshot channel closed. This is legal: a client could be destroyed while waiting for a flush.
|
||||
debug!("deletion queue flush from dropped client");
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct DeletionQueueClient {
|
||||
tx: tokio::sync::mpsc::Sender<FrontendQueueMessage>,
|
||||
executor_tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct TenantDeletionList {
|
||||
/// For each Timeline, a list of key fragments to append to the timeline remote path
|
||||
/// when reconstructing a full key
|
||||
timelines: HashMap<TimelineId, Vec<String>>,
|
||||
|
||||
/// The generation in which this deletion was emitted: note that this may not be the
|
||||
/// same as the generation of any layers being deleted. The generation of the layer
|
||||
/// has already been absorbed into the keys in `objects`
|
||||
generation: Generation,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct DeletionList {
|
||||
/// Serialization version, for future use
|
||||
version: u8,
|
||||
|
||||
/// Used for constructing a unique key for each deletion list we write out.
|
||||
sequence: u64,
|
||||
|
||||
/// To avoid repeating tenant/timeline IDs in every key, we store keys in
|
||||
/// nested HashMaps by TenantTimelineID. Each Tenant only appears once
|
||||
/// with one unique generation ID: if someone tries to push a second generation
|
||||
/// ID for the same tenant, we will start a new DeletionList.
|
||||
tenants: HashMap<TenantId, TenantDeletionList>,
|
||||
|
||||
/// Avoid having to walk `tenants` to calculate size
|
||||
size: usize,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct DeletionHeader {
|
||||
/// Serialization version, for future use
|
||||
version: u8,
|
||||
|
||||
/// Enable determining the next sequence number even if there are no deletion lists present.
|
||||
/// If there _are_ deletion lists present, then their sequence numbers take precedence over
|
||||
/// this.
|
||||
last_deleted_list_seq: u64,
|
||||
// TODO: this is where we will track a 'clean' sequence number that indicates all deletion
|
||||
// lists <= that sequence have had their generations validated with the control plane
|
||||
// and are OK to execute.
|
||||
}
|
||||
|
||||
impl DeletionHeader {
|
||||
const VERSION_LATEST: u8 = 1;
|
||||
|
||||
fn new(last_deleted_list_seq: u64) -> Self {
|
||||
Self {
|
||||
version: Self::VERSION_LATEST,
|
||||
last_deleted_list_seq,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DeletionList {
|
||||
const VERSION_LATEST: u8 = 1;
|
||||
fn new(sequence: u64) -> Self {
|
||||
Self {
|
||||
version: Self::VERSION_LATEST,
|
||||
sequence,
|
||||
tenants: HashMap::new(),
|
||||
size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn drain(&mut self) -> Self {
|
||||
let mut tenants = HashMap::new();
|
||||
std::mem::swap(&mut self.tenants, &mut tenants);
|
||||
let other = Self {
|
||||
version: Self::VERSION_LATEST,
|
||||
sequence: self.sequence,
|
||||
tenants,
|
||||
size: self.size,
|
||||
};
|
||||
self.size = 0;
|
||||
other
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.tenants.is_empty()
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.size
|
||||
}
|
||||
|
||||
/// Returns true if the push was accepted, false if the caller must start a new
|
||||
/// deletion list.
|
||||
fn push(
|
||||
&mut self,
|
||||
tenant: &TenantId,
|
||||
timeline: &TimelineId,
|
||||
generation: Generation,
|
||||
objects: &mut Vec<RemotePath>,
|
||||
) -> bool {
|
||||
if objects.is_empty() {
|
||||
// Avoid inserting an empty TimelineDeletionList: this preserves the property
|
||||
// that if we have no keys, then self.objects is empty (used in Self::is_empty)
|
||||
return true;
|
||||
}
|
||||
|
||||
let tenant_entry = self
|
||||
.tenants
|
||||
.entry(*tenant)
|
||||
.or_insert_with(|| TenantDeletionList {
|
||||
timelines: HashMap::new(),
|
||||
generation,
|
||||
});
|
||||
|
||||
if tenant_entry.generation != generation {
|
||||
// Only one generation per tenant per list: signal to
|
||||
// caller to start a new list.
|
||||
return false;
|
||||
}
|
||||
|
||||
let timeline_entry = tenant_entry
|
||||
.timelines
|
||||
.entry(*timeline)
|
||||
.or_insert_with(|| Vec::new());
|
||||
|
||||
let timeline_remote_path = remote_timeline_path(tenant, timeline);
|
||||
|
||||
self.size += objects.len();
|
||||
timeline_entry.extend(objects.drain(..).map(|p| {
|
||||
p.strip_prefix(&timeline_remote_path)
|
||||
.expect("Timeline paths always start with the timeline prefix")
|
||||
.to_string_lossy()
|
||||
.to_string()
|
||||
}));
|
||||
true
|
||||
}
|
||||
|
||||
fn take_paths(self) -> Vec<RemotePath> {
|
||||
let mut result = Vec::new();
|
||||
for (tenant, tenant_deletions) in self.tenants.into_iter() {
|
||||
for (timeline, timeline_layers) in tenant_deletions.timelines.into_iter() {
|
||||
let timeline_remote_path = remote_timeline_path(&tenant, &timeline);
|
||||
result.extend(
|
||||
timeline_layers
|
||||
.into_iter()
|
||||
.map(|l| timeline_remote_path.join(&PathBuf::from(l))),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
}
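
A hedged sketch of how a caller inside this module might drive DeletionList::push, rolling over to a fresh list when push refuses an entry because the tenant already appears with a different generation. The helper name and rollover policy here are illustrative, not taken from this diff:

fn push_or_rollover(
    current: &mut DeletionList,
    next_sequence: &mut u64,
    tenant: &TenantId,
    timeline: &TimelineId,
    generation: Generation,
    objects: &mut Vec<RemotePath>,
) -> Option<DeletionList> {
    if current.push(tenant, timeline, generation, objects) {
        return None; // accepted into the current list
    }
    // Refused: start a new list and hand the filled one back for persisting.
    *next_sequence += 1;
    let mut fresh = DeletionList::new(*next_sequence);
    assert!(fresh.push(tenant, timeline, generation, objects));
    Some(std::mem::replace(current, fresh))
}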
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum DeletionQueueError {
|
||||
#[error("Deletion queue unavailable during shutdown")]
|
||||
ShuttingDown,
|
||||
}
|
||||
|
||||
impl DeletionQueueClient {
|
||||
async fn do_push(&self, msg: FrontendQueueMessage) -> Result<(), DeletionQueueError> {
|
||||
match self.tx.send(msg).await {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => {
|
||||
// This shouldn't happen, we should shut down all tenants before
|
||||
// we shut down the global delete queue. If we encounter a bug like this,
|
||||
// we may leak objects as deletions won't be processed.
|
||||
error!("Deletion queue closed while pushing, shutting down? ({e})");
|
||||
Err(DeletionQueueError::ShuttingDown)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Submit a list of layers for deletion: this function will return before the deletion is
|
||||
/// persistent, but it may be executed at any time after this function enters: do not push
|
||||
/// layers until you're sure they can be deleted safely (i.e. remote metadata no longer
|
||||
/// references them).
|
||||
pub(crate) async fn push_layers(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
generation: Generation,
|
||||
layers: Vec<(LayerFileName, Generation)>,
|
||||
) -> Result<(), DeletionQueueError> {
|
||||
DELETION_QUEUE_SUBMITTED.inc_by(layers.len() as u64);
|
||||
self.do_push(FrontendQueueMessage::Delete(DeletionOp {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
layers,
|
||||
generation,
|
||||
objects: Vec::new(),
|
||||
}))
|
||||
.await
|
||||
}
|
||||
|
||||
async fn do_flush(
|
||||
&self,
|
||||
msg: FrontendQueueMessage,
|
||||
rx: tokio::sync::oneshot::Receiver<()>,
|
||||
) -> Result<(), DeletionQueueError> {
|
||||
self.do_push(msg).await?;
|
||||
if rx.await.is_err() {
|
||||
// This shouldn't happen if tenants are shut down before deletion queue. If we
|
||||
// encounter a bug like this, then a flusher will incorrectly believe it has flushed
|
||||
// when it hasn't, possibly leading to leaking objects.
|
||||
error!("Deletion queue dropped flush op while client was still waiting");
|
||||
Err(DeletionQueueError::ShuttingDown)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Wait until all previous deletions are persistent (either executed, or written to a DeletionList)
|
||||
pub async fn flush(&self) -> Result<(), DeletionQueueError> {
|
||||
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
|
||||
self.do_flush(FrontendQueueMessage::Flush(FlushOp { tx }), rx)
|
||||
.await
|
||||
}
|
||||
|
||||
// Wait until all previous deletions are executed
|
||||
pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
|
||||
debug!("flush_execute: flushing to deletion lists...");
|
||||
// Flush any buffered work to deletion lists
|
||||
self.flush().await?;
|
||||
|
||||
// Flush execution of deletion lists
|
||||
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
|
||||
debug!("flush_execute: flushing execution...");
|
||||
self.do_flush(FrontendQueueMessage::FlushExecute(FlushOp { tx }), rx)
|
||||
.await?;
|
||||
debug!("flush_execute: finished flushing execution...");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This interface bypasses the persistent deletion queue, and any validation
|
||||
/// that this pageserver is still eligible to execute the deletions. It is for
|
||||
/// use in timeline deletions, where the control plane is telling us we may
|
||||
/// delete everything in the timeline.
|
||||
///
|
||||
/// DO NOT USE THIS FROM GC OR COMPACTION CODE. Use the regular `push_layers`.
|
||||
pub(crate) async fn push_immediate(
|
||||
&self,
|
||||
objects: Vec<RemotePath>,
|
||||
) -> Result<(), DeletionQueueError> {
|
||||
self.executor_tx
|
||||
.send(ExecutorMessage::Delete(objects))
|
||||
.await
|
||||
.map_err(|_| DeletionQueueError::ShuttingDown)
|
||||
}
|
||||
|
||||
/// Companion to push_immediate. When this returns Ok, all prior objects sent
|
||||
/// into push_immediate have been deleted from remote storage.
|
||||
pub(crate) async fn flush_immediate(&self) -> Result<(), DeletionQueueError> {
|
||||
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
|
||||
self.executor_tx
|
||||
.send(ExecutorMessage::Flush(FlushOp { tx }))
|
||||
.await
|
||||
.map_err(|_| DeletionQueueError::ShuttingDown)?;
|
||||
|
||||
rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
|
||||
}
|
||||
}
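
For reference, a condensed usage sketch mirroring the smoke test further down: push a layer deletion, flush it into a persistent DeletionList, then force execution. Identifiers are placeholders and error handling is elided:

async fn delete_one_layer(
    client: &DeletionQueueClient,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    layer: LayerFileName,
    layer_generation: Generation,
    current_generation: Generation,
) -> Result<(), DeletionQueueError> {
    client
        .push_layers(
            tenant_id,
            timeline_id,
            current_generation,
            vec![(layer, layer_generation)],
        )
        .await?;
    client.flush().await?; // persisted into a DeletionList on local disk
    client.flush_execute().await?; // validated and removed from remote storage
    Ok(())
}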
|
||||
|
||||
impl DeletionQueue {
|
||||
pub fn new_client(&self) -> DeletionQueueClient {
|
||||
self.client.clone()
|
||||
}
|
||||
|
||||
/// Caller may use the returned object to construct clients with new_client.
|
||||
/// Caller should tokio::spawn the background() members of the two worker objects returned:
|
||||
/// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
|
||||
///
|
||||
/// If remote_storage is None, then the returned workers will also be None.
|
||||
pub fn new(
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
conf: &'static PageServerConf,
|
||||
cancel: CancellationToken,
|
||||
) -> (
|
||||
Self,
|
||||
Option<FrontendQueueWorker>,
|
||||
Option<BackendQueueWorker>,
|
||||
Option<ExecutorWorker>,
|
||||
) {
|
||||
// Deep channel: it consumes deletions from all timelines and we do not want to block them
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(16384);
|
||||
|
||||
// Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
|
||||
let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
|
||||
|
||||
// Shallow channel: it carries lists of paths, and we expect the main queueing to
|
||||
// happen in the backend (persistent), not in this queue.
|
||||
let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16);
|
||||
|
||||
let remote_storage = match remote_storage {
|
||||
None => {
|
||||
return (
|
||||
Self {
|
||||
client: DeletionQueueClient { tx, executor_tx },
|
||||
},
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
}
|
||||
Some(r) => r,
|
||||
};
|
||||
|
||||
(
|
||||
Self {
|
||||
client: DeletionQueueClient {
|
||||
tx,
|
||||
executor_tx: executor_tx.clone(),
|
||||
},
|
||||
},
|
||||
Some(FrontendQueueWorker::new(
|
||||
conf,
|
||||
rx,
|
||||
backend_tx,
|
||||
cancel.clone(),
|
||||
)),
|
||||
Some(BackendQueueWorker::new(
|
||||
conf,
|
||||
backend_rx,
|
||||
executor_tx,
|
||||
cancel.clone(),
|
||||
)),
|
||||
Some(ExecutorWorker::new(
|
||||
remote_storage,
|
||||
executor_rx,
|
||||
cancel.clone(),
|
||||
)),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use hex_literal::hex;
|
||||
use std::{
|
||||
io::ErrorKind,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
use tracing::info;
|
||||
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||
use tokio::{runtime::EnterGuard, task::JoinHandle};
|
||||
|
||||
use crate::tenant::{harness::TenantHarness, remote_timeline_client::remote_timeline_path};
|
||||
|
||||
use super::*;
|
||||
pub const TIMELINE_ID: TimelineId =
|
||||
TimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
|
||||
struct TestSetup {
|
||||
runtime: &'static tokio::runtime::Runtime,
|
||||
_entered_runtime: EnterGuard<'static>,
|
||||
harness: TenantHarness,
|
||||
remote_fs_dir: PathBuf,
|
||||
storage: GenericRemoteStorage,
|
||||
deletion_queue: DeletionQueue,
|
||||
fe_worker: JoinHandle<()>,
|
||||
be_worker: JoinHandle<()>,
|
||||
ex_worker: JoinHandle<()>,
|
||||
}
|
||||
|
||||
impl TestSetup {
|
||||
/// Simulate a pageserver restart by destroying and recreating the deletion queue
|
||||
fn restart(&mut self) {
|
||||
let (deletion_queue, fe_worker, be_worker, ex_worker) = DeletionQueue::new(
|
||||
Some(self.storage.clone()),
|
||||
self.harness.conf,
|
||||
CancellationToken::new(),
|
||||
);
|
||||
|
||||
self.deletion_queue = deletion_queue;
|
||||
|
||||
let mut fe_worker = fe_worker.unwrap();
|
||||
let mut be_worker = be_worker.unwrap();
|
||||
let mut ex_worker = ex_worker.unwrap();
|
||||
let mut fe_worker = self
|
||||
.runtime
|
||||
.spawn(async move { fe_worker.background().await });
|
||||
let mut be_worker = self
|
||||
.runtime
|
||||
.spawn(async move { be_worker.background().await });
|
||||
let mut ex_worker = self.runtime.spawn(async move {
|
||||
drop(ex_worker.background().await);
|
||||
});
|
||||
std::mem::swap(&mut self.fe_worker, &mut fe_worker);
|
||||
std::mem::swap(&mut self.be_worker, &mut be_worker);
|
||||
std::mem::swap(&mut self.ex_worker, &mut ex_worker);
|
||||
|
||||
// Join the old workers
|
||||
self.runtime.block_on(fe_worker).unwrap();
|
||||
self.runtime.block_on(be_worker).unwrap();
|
||||
self.runtime.block_on(ex_worker).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
|
||||
let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
|
||||
let harness = TenantHarness::create(test_name)?;
|
||||
|
||||
// We do not load() the harness: we only need its config and remote_storage
|
||||
|
||||
// Set up a GenericRemoteStorage targeting a directory
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
|
||||
std::fs::create_dir_all(remote_fs_dir)?;
|
||||
let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
|
||||
let storage_config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
|
||||
)
|
||||
.unwrap(),
|
||||
max_sync_errors: std::num::NonZeroU32::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
|
||||
)
|
||||
.unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
};
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
||||
|
||||
let runtime = Box::leak(Box::new(
|
||||
tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?,
|
||||
));
|
||||
let entered_runtime = runtime.enter();
|
||||
|
||||
let (deletion_queue, fe_worker, be_worker, ex_worker) = DeletionQueue::new(
|
||||
Some(storage.clone()),
|
||||
harness.conf,
|
||||
CancellationToken::new(),
|
||||
);
|
||||
|
||||
let mut fe_worker = fe_worker.unwrap();
|
||||
let mut be_worker = be_worker.unwrap();
|
||||
let mut ex_worker = ex_worker.unwrap();
|
||||
let fe_worker_join = runtime.spawn(async move { fe_worker.background().await });
|
||||
let be_worker_join = runtime.spawn(async move { be_worker.background().await });
|
||||
let ex_worker_join = runtime.spawn(async move {
|
||||
drop(ex_worker.background().await);
|
||||
});
|
||||
|
||||
Ok(TestSetup {
|
||||
runtime,
|
||||
_entered_runtime: entered_runtime,
|
||||
harness,
|
||||
remote_fs_dir,
|
||||
storage,
|
||||
deletion_queue,
|
||||
fe_worker: fe_worker_join,
|
||||
be_worker: be_worker_join,
|
||||
ex_worker: ex_worker_join,
|
||||
})
|
||||
}
|
||||
|
||||
// TODO: put this in a common location so that we can share with remote_timeline_client's tests
|
||||
fn assert_remote_files(expected: &[&str], remote_path: &Path) {
|
||||
let mut expected: Vec<String> = expected.iter().map(|x| String::from(*x)).collect();
|
||||
expected.sort();
|
||||
|
||||
let mut found: Vec<String> = Vec::new();
|
||||
let dir = match std::fs::read_dir(remote_path) {
|
||||
Ok(d) => d,
|
||||
Err(e) => {
|
||||
if e.kind() == ErrorKind::NotFound {
|
||||
if expected.is_empty() {
|
||||
// We are asserting prefix is empty: it is expected that the dir is missing
|
||||
return;
|
||||
} else {
|
||||
assert_eq!(expected, Vec::<String>::new());
|
||||
unreachable!();
|
||||
}
|
||||
} else {
|
||||
panic!(
|
||||
"Unexpected error listing {0}: {e}",
|
||||
remote_path.to_string_lossy()
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for entry in dir.flatten() {
|
||||
let entry_name = entry.file_name();
|
||||
let fname = entry_name.to_str().unwrap();
|
||||
found.push(String::from(fname));
|
||||
}
|
||||
found.sort();
|
||||
|
||||
assert_eq!(expected, found);
|
||||
}
|
||||
|
||||
fn assert_local_files(expected: &[&str], directory: &Path) {
|
||||
let mut dir = match std::fs::read_dir(directory) {
|
||||
Ok(d) => d,
|
||||
Err(_) => {
|
||||
assert_eq!(expected, &Vec::<String>::new());
|
||||
return;
|
||||
}
|
||||
};
|
||||
let mut found = Vec::new();
|
||||
while let Some(dentry) = dir.next() {
|
||||
let dentry = dentry.unwrap();
|
||||
let file_name = dentry.file_name();
|
||||
let file_name_str = file_name.to_string_lossy();
|
||||
found.push(file_name_str.to_string());
|
||||
}
|
||||
found.sort();
|
||||
assert_eq!(expected, found);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deletion_queue_smoke() -> anyhow::Result<()> {
|
||||
// Basic test that the deletion queue processes the deletions we pass into it
|
||||
let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
let tenant_id = ctx.harness.tenant_id;
|
||||
|
||||
let content: Vec<u8> = "victim1 contents".into();
|
||||
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
|
||||
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
|
||||
let deletion_prefix = ctx.harness.conf.deletion_prefix();
|
||||
|
||||
// Exercise the distinction between the generation of the layers
|
||||
// we delete, and the generation of the running Tenant.
|
||||
let layer_generation = Generation::new(0xdeadbeef);
|
||||
let now_generation = Generation::new(0xfeedbeef);
|
||||
|
||||
let remote_layer_file_name_1 =
|
||||
format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
|
||||
|
||||
// Inject a victim file to remote storage
|
||||
info!("Writing");
|
||||
std::fs::create_dir_all(&remote_timeline_path)?;
|
||||
std::fs::write(
|
||||
remote_timeline_path.join(remote_layer_file_name_1.clone()),
|
||||
content,
|
||||
)?;
|
||||
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
|
||||
|
||||
// File should still be there after we push it to the queue (we haven't pushed enough to flush anything)
|
||||
info!("Pushing");
|
||||
ctx.runtime.block_on(client.push_layers(
|
||||
tenant_id,
|
||||
TIMELINE_ID,
|
||||
now_generation,
|
||||
[(layer_file_name_1.clone(), layer_generation)].to_vec(),
|
||||
))?;
|
||||
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
|
||||
|
||||
assert_local_files(&[], &deletion_prefix);
|
||||
|
||||
// File should still be there after we write a deletion list (we haven't pushed enough to execute anything)
|
||||
info!("Flushing");
|
||||
ctx.runtime.block_on(client.flush())?;
|
||||
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
|
||||
assert_local_files(&["0000000000000001-01.list"], &deletion_prefix);
|
||||
|
||||
// File should go away when we execute
|
||||
info!("Flush-executing");
|
||||
ctx.runtime.block_on(client.flush_execute())?;
|
||||
assert_remote_files(&[], &remote_timeline_path);
|
||||
assert_local_files(&["header-01"], &deletion_prefix);
|
||||
|
||||
// Flushing on an empty queue should succeed immediately, and not write any lists
|
||||
info!("Flush-executing on empty");
|
||||
ctx.runtime.block_on(client.flush_execute())?;
|
||||
assert_local_files(&["header-01"], &deletion_prefix);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deletion_queue_recovery() -> anyhow::Result<()> {
|
||||
// Basic test that the deletion queue processes the deletions we pass into it
|
||||
let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
let tenant_id = ctx.harness.tenant_id;
|
||||
|
||||
let content: Vec<u8> = "victim1 contents".into();
|
||||
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
|
||||
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
|
||||
let deletion_prefix = ctx.harness.conf.deletion_prefix();
|
||||
let layer_generation = Generation::new(0xdeadbeef);
|
||||
let now_generation = Generation::new(0xfeedbeef);
|
||||
let remote_layer_file_name_1 =
|
||||
format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
|
||||
|
||||
// Inject a file, delete it, and flush to a deletion list
|
||||
std::fs::create_dir_all(&remote_timeline_path)?;
|
||||
std::fs::write(
|
||||
remote_timeline_path.join(remote_layer_file_name_1.clone()),
|
||||
content,
|
||||
)?;
|
||||
ctx.runtime.block_on(client.push_layers(
|
||||
tenant_id,
|
||||
TIMELINE_ID,
|
||||
now_generation,
|
||||
[(layer_file_name_1.clone(), layer_generation)].to_vec(),
|
||||
))?;
|
||||
ctx.runtime.block_on(client.flush())?;
|
||||
assert_local_files(&["0000000000000001-01.list"], &deletion_prefix);
|
||||
|
||||
// Restart the deletion queue
|
||||
drop(client);
|
||||
ctx.restart();
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
|
||||
// If we have recovered the deletion list properly, then executing after restart should purge it
|
||||
info!("Flush-executing");
|
||||
ctx.runtime.block_on(client.flush_execute())?;
|
||||
assert_remote_files(&[], &remote_timeline_path);
|
||||
assert_local_files(&["header-01"], &deletion_prefix);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A lightweight queue which can issue ordinary DeletionQueueClient objects, but doesn't do any persistence
|
||||
/// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it.
|
||||
#[cfg(test)]
|
||||
pub mod mock {
|
||||
use tracing::info;
|
||||
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
|
||||
use super::*;
|
||||
use std::sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc,
|
||||
};
|
||||
|
||||
pub struct MockDeletionQueue {
|
||||
tx: tokio::sync::mpsc::Sender<FrontendQueueMessage>,
|
||||
executor_tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
|
||||
tx_pump: tokio::sync::mpsc::Sender<FlushOp>,
|
||||
executed: Arc<AtomicUsize>,
|
||||
}
|
||||
|
||||
impl MockDeletionQueue {
|
||||
pub fn new(remote_storage: Option<GenericRemoteStorage>) -> Self {
|
||||
let (tx, mut rx) = tokio::sync::mpsc::channel(16384);
|
||||
let (tx_pump, mut rx_pump) = tokio::sync::mpsc::channel::<FlushOp>(1);
|
||||
let (executor_tx, mut executor_rx) = tokio::sync::mpsc::channel(16384);
|
||||
|
||||
let executed = Arc::new(AtomicUsize::new(0));
|
||||
let executed_bg = executed.clone();
|
||||
|
||||
tokio::spawn(async move {
|
||||
let remote_storage = match &remote_storage {
|
||||
Some(rs) => rs,
|
||||
None => {
|
||||
info!("No remote storage configured, deletion queue will not run");
|
||||
return;
|
||||
}
|
||||
};
|
||||
info!("Running mock deletion queue");
|
||||
// Each time we are asked to pump, drain the queue of deletions
|
||||
while let Some(flush_op) = rx_pump.recv().await {
|
||||
info!("Executing all pending deletions");
|
||||
|
||||
// Transform all executor messages to generic frontend messages
|
||||
while let Ok(msg) = executor_rx.try_recv() {
|
||||
match msg {
|
||||
ExecutorMessage::Delete(objects) => {
|
||||
for path in objects {
|
||||
match remote_storage.delete(&path).await {
|
||||
Ok(_) => {
|
||||
debug!("Deleted {path}");
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to delete {path}, leaking object! ({e})"
|
||||
);
|
||||
}
|
||||
}
|
||||
executed_bg.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
ExecutorMessage::Flush(flush_op) => {
|
||||
flush_op.fire();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while let Ok(msg) = rx.try_recv() {
|
||||
match msg {
|
||||
FrontendQueueMessage::Delete(op) => {
|
||||
let mut objects = op.objects;
|
||||
for (layer, generation) in op.layers {
|
||||
objects.push(remote_layer_path(
|
||||
&op.tenant_id,
|
||||
&op.timeline_id,
|
||||
&layer,
|
||||
generation,
|
||||
));
|
||||
}
|
||||
|
||||
for path in objects {
|
||||
info!("Executing deletion {path}");
|
||||
match remote_storage.delete(&path).await {
|
||||
Ok(_) => {
|
||||
debug!("Deleted {path}");
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to delete {path}, leaking object! ({e})"
|
||||
);
|
||||
}
|
||||
}
|
||||
executed_bg.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
FrontendQueueMessage::Flush(op) => {
|
||||
op.fire();
|
||||
}
|
||||
FrontendQueueMessage::FlushExecute(op) => {
|
||||
// We have already executed all prior deletions because mock does them inline
|
||||
op.fire();
|
||||
}
|
||||
}
|
||||
info!("All pending deletions have been executed");
|
||||
}
|
||||
flush_op
|
||||
.tx
|
||||
.send(())
|
||||
.expect("Test called flush but dropped before finishing");
|
||||
}
|
||||
});
|
||||
|
||||
Self {
|
||||
tx,
|
||||
tx_pump,
|
||||
executor_tx,
|
||||
executed,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_executed(&self) -> usize {
|
||||
self.executed.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
pub async fn pump(&self) {
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
self.tx_pump
|
||||
.send(FlushOp { tx })
|
||||
.await
|
||||
.expect("pump called after deletion queue loop stopped");
|
||||
rx.await
|
||||
.expect("Mock delete queue shutdown while waiting to pump");
|
||||
}
|
||||
|
||||
pub(crate) fn new_client(&self) -> DeletionQueueClient {
|
||||
DeletionQueueClient {
|
||||
tx: self.tx.clone(),
|
||||
executor_tx: self.executor_tx.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,300 +0,0 @@
|
||||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
|
||||
use futures::future::TryFutureExt;
|
||||
use pageserver_api::control_api::HexTenantId;
|
||||
use pageserver_api::control_api::{ValidateRequest, ValidateRequestTenant, ValidateResponse};
|
||||
use serde::de::DeserializeOwned;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
use utils::backoff;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::metrics::DELETION_QUEUE_ERRORS;
|
||||
|
||||
use super::executor::ExecutorMessage;
|
||||
use super::DeletionHeader;
|
||||
use super::DeletionList;
|
||||
use super::DeletionQueueError;
|
||||
use super::FlushOp;
|
||||
|
||||
// After this length of time, execute deletions which are eligible to run,
|
||||
// even if we haven't accumulated enough for a full-sized DeleteObjects
|
||||
const EXECUTE_IDLE_DEADLINE: Duration = Duration::from_secs(60);
|
||||
|
||||
// If we have received this number of keys, proceed with attempting to execute
|
||||
const AUTOFLUSH_KEY_COUNT: usize = 16384;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(super) enum BackendQueueMessage {
|
||||
Delete(DeletionList),
|
||||
Flush(FlushOp),
|
||||
}
|
||||
pub struct BackendQueueWorker {
|
||||
conf: &'static PageServerConf,
|
||||
rx: tokio::sync::mpsc::Receiver<BackendQueueMessage>,
|
||||
tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
|
||||
|
||||
// Accumulate some lists to execute in a batch.
|
||||
// The purpose of this accumulation is to implement batched validation of
|
||||
// attachment generations, when split-brain protection is implemented.
|
||||
// (see https://github.com/neondatabase/neon/pull/4919)
|
||||
pending_lists: Vec<DeletionList>,
|
||||
|
||||
// Sum of all the lengths of lists in pending_lists
|
||||
pending_key_count: usize,
|
||||
|
||||
// DeletionLists we have fully executed, which may be deleted
|
||||
// from remote storage.
|
||||
executed_lists: Vec<DeletionList>,
|
||||
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum ValidateCallError {
|
||||
#[error("shutdown")]
|
||||
Shutdown,
|
||||
#[error("remote: {0}")]
|
||||
Remote(reqwest::Error),
|
||||
}
|
||||
|
||||
async fn retry_http_forever<T>(
|
||||
url: &url::Url,
|
||||
request: ValidateRequest,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<T, DeletionQueueError>
|
||||
where
|
||||
T: DeserializeOwned,
|
||||
{
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client");
|
||||
|
||||
let response = match backoff::retry(
|
||||
|| {
|
||||
client
|
||||
.post(url.clone())
|
||||
.json(&request)
|
||||
.send()
|
||||
.map_err(|e| ValidateCallError::Remote(e))
|
||||
},
|
||||
|_| false,
|
||||
3,
|
||||
u32::MAX,
|
||||
"calling control plane generation validation API",
|
||||
backoff::Cancel::new(cancel.clone(), || ValidateCallError::Shutdown),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(ValidateCallError::Shutdown) => {
|
||||
return Err(DeletionQueueError::ShuttingDown);
|
||||
}
|
||||
Err(ValidateCallError::Remote(_)) => {
|
||||
panic!("We retry forever");
|
||||
}
|
||||
Ok(r) => r,
|
||||
};
|
||||
|
||||
// TODO: handle non-200 response
|
||||
// TODO: handle decode error
|
||||
Ok(response.json::<T>().await.unwrap())
|
||||
}
|
||||
|
||||
impl BackendQueueWorker {
|
||||
pub(super) fn new(
|
||||
conf: &'static PageServerConf,
|
||||
rx: tokio::sync::mpsc::Receiver<BackendQueueMessage>,
|
||||
tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
Self {
|
||||
conf,
|
||||
rx,
|
||||
tx,
|
||||
pending_lists: Vec::new(),
|
||||
pending_key_count: 0,
|
||||
executed_lists: Vec::new(),
|
||||
cancel,
|
||||
}
|
||||
}
|
||||
|
||||
async fn cleanup_lists(&mut self) {
|
||||
debug!(
|
||||
"cleanup_lists: {0} executed lists, {1} pending lists",
|
||||
self.executed_lists.len(),
|
||||
self.pending_lists.len()
|
||||
);
|
||||
|
||||
// Lists are always pushed into the queues + executed list in sequence order, so
|
||||
// no sort is required: can find the highest sequence number by peeking at last element
|
||||
let max_executed_seq = match self.executed_lists.last() {
|
||||
Some(v) => v.sequence,
|
||||
None => {
|
||||
// No executed lists, nothing to clean up.
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// In case this is the last list, write a header out first so that
|
||||
// we don't risk losing our knowledge of the sequence number (on replay, our
|
||||
// next sequence number is the highest list seen + 1, or read from the header
|
||||
// if there are no lists)
|
||||
let header = DeletionHeader::new(max_executed_seq);
|
||||
debug!("Writing header {:?}", header);
|
||||
let header_bytes =
|
||||
serde_json::to_vec(&header).expect("Failed to serialize deletion header");
|
||||
let header_path = self.conf.deletion_header_path();
|
||||
|
||||
if let Err(e) = tokio::fs::write(&header_path, header_bytes).await {
|
||||
warn!("Failed to upload deletion queue header: {e:#}");
|
||||
DELETION_QUEUE_ERRORS
|
||||
.with_label_values(&["put_header"])
|
||||
.inc();
|
||||
return;
|
||||
}
|
||||
|
||||
while let Some(list) = self.executed_lists.pop() {
|
||||
let list_path = self.conf.deletion_list_path(list.sequence);
|
||||
if let Err(e) = tokio::fs::remove_file(&list_path).await {
|
||||
// Unexpected: we should have permissions and nothing else should
|
||||
// be touching these files
|
||||
tracing::error!("Failed to delete {0}: {e:#}", list_path.display());
|
||||
self.executed_lists.push(list);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn validate_lists(&mut self) -> Result<(), DeletionQueueError> {
|
||||
let control_plane_api = match &self.conf.control_plane_api {
|
||||
None => {
|
||||
// Generations are not switched on yet.
|
||||
return Ok(());
|
||||
}
|
||||
Some(api) => api,
|
||||
};
|
||||
|
||||
let validate_path = control_plane_api
|
||||
.join("validate")
|
||||
.expect("Failed to build validate path");
|
||||
|
||||
for list in &mut self.pending_lists {
|
||||
let request = ValidateRequest {
|
||||
tenants: list
|
||||
.tenants
|
||||
.iter()
|
||||
.map(|(tid, tdl)| ValidateRequestTenant {
|
||||
id: HexTenantId::new(*tid),
|
||||
gen: tdl.generation.into().expect(
|
||||
"Generation should always be valid for a Tenant doing deletions",
|
||||
),
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
|
||||
// Retry forever, we cannot make progress until we get a response
|
||||
let response: ValidateResponse =
|
||||
retry_http_forever(&validate_path, request, self.cancel.clone()).await?;
|
||||
|
||||
let tenants_valid: HashMap<_, _> = response
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|t| (t.id.take(), t.valid))
|
||||
.collect();
|
||||
|
||||
// Filter the list based on whether the server responded valid: true.
|
||||
// If a tenant is omitted in the response, it has been deleted, and we should
|
||||
// proceed with deletion.
|
||||
list.tenants.retain(|tenant_id, _tenant| {
|
||||
let r = tenants_valid.get(tenant_id).map(|v| *v).unwrap_or(true);
|
||||
if !r {
|
||||
warn!("Dropping stale deletions for tenant {tenant_id}, objects may be leaked");
|
||||
}
|
||||
r
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn flush(&mut self) {
|
||||
// Issue any required generation validation calls to the control plane
|
||||
if let Err(DeletionQueueError::ShuttingDown) = self.validate_lists().await {
|
||||
warn!("Shutting down");
|
||||
return;
|
||||
}
|
||||
|
||||
// Submit all keys from pending DeletionLists into the executor
|
||||
for list in self.pending_lists.drain(..) {
|
||||
let objects = list.take_paths();
|
||||
if let Err(_e) = self.tx.send(ExecutorMessage::Delete(objects)).await {
|
||||
warn!("Shutting down");
|
||||
return;
|
||||
};
|
||||
}
|
||||
|
||||
// Flush the executor to ensure all the operations we just submitted have been executed
|
||||
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
|
||||
let flush_op = FlushOp { tx };
|
||||
if let Err(_e) = self.tx.send(ExecutorMessage::Flush(flush_op)).await {
|
||||
warn!("Shutting down");
|
||||
return;
|
||||
};
|
||||
if rx.await.is_err() {
|
||||
warn!("Shutting down");
|
||||
return;
|
||||
}
|
||||
|
||||
// After flush, we are assured that all contents of the pending lists
|
||||
// are executed
|
||||
self.pending_key_count = 0;
|
||||
self.executed_lists.append(&mut self.pending_lists);
|
||||
|
||||
// Erase the lists we executed
|
||||
self.cleanup_lists().await;
|
||||
}
|
||||
|
||||
pub async fn background(&mut self) {
|
||||
// TODO: if we would like to be able to defer deletions while a Layer still has
|
||||
// refs (but it will be eligible for deletion after process ends), then we may
|
||||
// add an ephemeral part to BackendQueueMessage::Delete that tracks which keys
|
||||
// in the deletion list may not be deleted yet, with guards to block on while
|
||||
// we wait to proceed.
|
||||
|
||||
loop {
|
||||
let msg = match tokio::time::timeout(EXECUTE_IDLE_DEADLINE, self.rx.recv()).await {
|
||||
Ok(Some(m)) => m,
|
||||
Ok(None) => {
|
||||
// All queue senders closed
|
||||
info!("Shutting down");
|
||||
break;
|
||||
}
|
||||
Err(_) => {
|
||||
// Timeout, we hit deadline to execute whatever we have in hand. These functions will
|
||||
// return immediately if no work is pending
|
||||
self.flush().await;
|
||||
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match msg {
|
||||
BackendQueueMessage::Delete(list) => {
|
||||
self.pending_key_count += list.len();
|
||||
self.pending_lists.push(list);
|
||||
|
||||
if self.pending_key_count > AUTOFLUSH_KEY_COUNT {
|
||||
self.flush().await;
|
||||
}
|
||||
}
|
||||
BackendQueueMessage::Flush(op) => {
|
||||
self.flush().await;
|
||||
op.fire();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
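
The retain decision above compresses to a small predicate; a sketch, assuming the same semantics (a tenant missing from the response has been deleted, so its deletions may proceed):

fn may_delete(tenants_valid: &HashMap<TenantId, bool>, tenant_id: &TenantId) -> bool {
    // valid: true  -> this pageserver still holds the latest generation, proceed
    // valid: false -> stale attachment, drop the deletions (objects may leak)
    // absent       -> tenant was deleted from the control plane, proceed
    tenants_valid.get(tenant_id).copied().unwrap_or(true)
}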
|
||||
@@ -1,143 +0,0 @@
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::RemotePath;
|
||||
use remote_storage::MAX_KEYS_PER_DELETE;
|
||||
use std::time::Duration;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
|
||||
use crate::metrics::DELETION_QUEUE_ERRORS;
|
||||
use crate::metrics::DELETION_QUEUE_EXECUTED;
|
||||
|
||||
use super::DeletionQueueError;
|
||||
use super::FlushOp;
|
||||
|
||||
const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
|
||||
|
||||
pub(super) enum ExecutorMessage {
|
||||
Delete(Vec<RemotePath>),
|
||||
Flush(FlushOp),
|
||||
}
|
||||
|
||||
/// Non-persistent deletion queue, for coalescing multiple object deletes into
|
||||
/// larger DeleteObjects requests.
|
||||
pub struct ExecutorWorker {
|
||||
// Accumulate up to 1000 keys for the next deletion operation
|
||||
accumulator: Vec<RemotePath>,
|
||||
|
||||
rx: tokio::sync::mpsc::Receiver<ExecutorMessage>,
|
||||
|
||||
cancel: CancellationToken,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
}
|
||||
|
||||
impl ExecutorWorker {
|
||||
pub(super) fn new(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
rx: tokio::sync::mpsc::Receiver<ExecutorMessage>,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
Self {
|
||||
remote_storage,
|
||||
rx,
|
||||
cancel,
|
||||
accumulator: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrap the remote `delete_objects` with a failpoint
|
||||
pub async fn remote_delete(&self) -> Result<(), anyhow::Error> {
|
||||
fail::fail_point!("deletion-queue-before-execute", |_| {
|
||||
info!("Skipping execution, failpoint set");
|
||||
DELETION_QUEUE_ERRORS
|
||||
.with_label_values(&["failpoint"])
|
||||
.inc();
|
||||
Err(anyhow::anyhow!("failpoint hit"))
|
||||
});
|
||||
|
||||
self.remote_storage.delete_objects(&self.accumulator).await
|
||||
}
|
||||
|
||||
/// Block until everything in accumulator has been executed
|
||||
pub async fn flush(&mut self) -> Result<(), DeletionQueueError> {
|
||||
while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
|
||||
match self.remote_delete().await {
|
||||
Ok(()) => {
|
||||
// Note: we assume that the remote storage layer returns Ok(()) if some
|
||||
// or all of the deleted objects were already gone.
|
||||
DELETION_QUEUE_EXECUTED.inc_by(self.accumulator.len() as u64);
|
||||
info!(
|
||||
"Executed deletion batch {}..{}",
|
||||
self.accumulator
|
||||
.first()
|
||||
.expect("accumulator should be non-empty"),
|
||||
self.accumulator
|
||||
.last()
|
||||
.expect("accumulator should be non-empty"),
|
||||
);
|
||||
self.accumulator.clear();
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("DeleteObjects request failed: {e:#}, will retry");
|
||||
DELETION_QUEUE_ERRORS.with_label_values(&["execute"]).inc();
|
||||
}
|
||||
};
|
||||
}
|
||||
if self.cancel.is_cancelled() {
|
||||
// Expose an error because we may not have actually flushed everything
|
||||
Err(DeletionQueueError::ShuttingDown)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn background(&mut self) -> Result<(), DeletionQueueError> {
|
||||
self.accumulator.reserve(MAX_KEYS_PER_DELETE);
|
||||
|
||||
loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(DeletionQueueError::ShuttingDown);
|
||||
}
|
||||
|
||||
let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
|
||||
Ok(Some(m)) => m,
|
||||
Ok(None) => {
|
||||
// All queue senders closed
|
||||
info!("Shutting down");
|
||||
return Err(DeletionQueueError::ShuttingDown);
|
||||
}
|
||||
Err(_) => {
|
||||
// Timeout: we hit the deadline to execute whatever we have in hand. These functions will
|
||||
// return immediately if no work is pending
|
||||
self.flush().await?;
|
||||
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match msg {
|
||||
ExecutorMessage::Delete(mut list) => {
|
||||
while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
|
||||
if self.accumulator.len() == MAX_KEYS_PER_DELETE {
|
||||
self.flush().await?;
|
||||
// If we have received this number of keys, proceed with attempting to execute
|
||||
assert_eq!(self.accumulator.len(), 0);
|
||||
}
|
||||
|
||||
let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
|
||||
let take_count = std::cmp::min(available_slots, list.len());
|
||||
for path in list.drain(list.len() - take_count..) {
|
||||
self.accumulator.push(path);
|
||||
}
|
||||
}
|
||||
}
|
||||
ExecutorMessage::Flush(flush_op) => {
|
||||
// If flush() errors, we drop the flush_op and the caller will get
|
||||
// an error recv()'ing their oneshot channel.
|
||||
self.flush().await?;
|
||||
flush_op.fire();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,376 +0,0 @@
|
||||
use super::BackendQueueMessage;
|
||||
use super::DeletionHeader;
|
||||
use super::DeletionList;
|
||||
use super::FlushOp;
|
||||
|
||||
use std::fs::create_dir_all;
|
||||
use std::time::Duration;
|
||||
|
||||
use regex::Regex;
|
||||
use remote_storage::RemotePath;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::TenantId;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::metrics::DELETION_QUEUE_ERRORS;
|
||||
use crate::metrics::DELETION_QUEUE_SUBMITTED;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
|
||||
// The number of keys in a DeletionList before we will proactively persist it
|
||||
// (without reaching a flush deadline). This aims to produce lists on the order of
|
||||
// 1MB when we are under heavy delete load.
|
||||
const DELETION_LIST_TARGET_SIZE: usize = 16384;
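// (Illustrative arithmetic, assuming typical remote layer key paths of roughly 64 bytes:
// 16384 keys * ~64 bytes is about 1MB of serialized list, which is where the
// "order of magnitude 1MB" figure above comes from.)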
|
||||
|
||||
// Ordinarily, we only flush to DeletionList periodically, to bound the window during
|
||||
// which we might leak objects from not flushing a DeletionList after
|
||||
// the objects are already unlinked from timeline metadata.
|
||||
const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000);
|
||||
|
||||
// If someone is waiting for a flush to DeletionList, only delay a little to accumulate
|
||||
// more objects before doing the flush.
|
||||
const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(super) struct DeletionOp {
|
||||
pub(super) tenant_id: TenantId,
|
||||
pub(super) timeline_id: TimelineId,
|
||||
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
|
||||
// have a config object handy to project it to a remote key, and need the consuming worker
|
||||
// to do it for you.
|
||||
pub(super) layers: Vec<(LayerFileName, Generation)>,
|
||||
pub(super) objects: Vec<RemotePath>,
|
||||
|
||||
/// The _current_ generation of the Tenant attachment in which we are enqueuing
|
||||
/// this deletion.
|
||||
pub(super) generation: Generation,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(super) enum FrontendQueueMessage {
|
||||
Delete(DeletionOp),
|
||||
// Wait until all prior deletions make it into a persistent DeletionList
|
||||
Flush(FlushOp),
|
||||
// Wait until all prior deletions have been executed (i.e. objects are actually deleted)
|
||||
FlushExecute(FlushOp),
|
||||
}
|
||||
|
||||
pub struct FrontendQueueWorker {
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
// Incoming frontend requests to delete some keys
|
||||
rx: tokio::sync::mpsc::Receiver<FrontendQueueMessage>,
|
||||
|
||||
// Outbound requests to the backend to execute deletion lists we have composed.
|
||||
tx: tokio::sync::mpsc::Sender<BackendQueueMessage>,
|
||||
|
||||
// The list we are currently building, contains a buffer of keys to delete
|
||||
// and our next sequence number
|
||||
pending: DeletionList,
|
||||
|
||||
// These FlushOps should fire the next time we flush
|
||||
pending_flushes: Vec<FlushOp>,
|
||||
|
||||
// Worker loop is torn down when this fires.
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl FrontendQueueWorker {
|
||||
pub(super) fn new(
|
||||
conf: &'static PageServerConf,
|
||||
rx: tokio::sync::mpsc::Receiver<FrontendQueueMessage>,
|
||||
tx: tokio::sync::mpsc::Sender<BackendQueueMessage>,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
Self {
|
||||
pending: DeletionList::new(1),
|
||||
conf,
|
||||
rx,
|
||||
tx,
|
||||
pending_flushes: Vec::new(),
|
||||
cancel,
|
||||
}
|
||||
}
|
||||
async fn upload_pending_list(&mut self) -> anyhow::Result<()> {
|
||||
let path = self.conf.deletion_list_path(self.pending.sequence);
|
||||
|
||||
let bytes = serde_json::to_vec(&self.pending).expect("Failed to serialize deletion list");
|
||||
tokio::fs::write(&path, &bytes).await?;
|
||||
tokio::fs::File::open(&path).await?.sync_all().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Try to flush `list` to persistent storage
|
||||
///
|
||||
/// This does not return errors, because on failure to flush we do not lose
|
||||
/// any state: flushing will be retried implicitly on the next deadline
|
||||
async fn flush(&mut self) {
|
||||
if self.pending.is_empty() {
|
||||
for f in self.pending_flushes.drain(..) {
|
||||
f.fire();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
match self.upload_pending_list().await {
|
||||
Ok(_) => {
|
||||
info!(sequence = self.pending.sequence, "Stored deletion list");
|
||||
|
||||
for f in self.pending_flushes.drain(..) {
|
||||
f.fire();
|
||||
}
|
||||
|
||||
let onward_list = self.pending.drain();
|
||||
|
||||
// We have consumed the contents of pending: reset it so the next incoming deletions accumulate there
|
||||
self.pending = DeletionList::new(self.pending.sequence + 1);
|
||||
|
||||
if let Err(e) = self.tx.send(BackendQueueMessage::Delete(onward_list)).await {
|
||||
// This is allowed to fail: it will only happen if the backend worker is shut down,
|
||||
// so we can just drop this on the floor.
|
||||
info!("Deletion list dropped, this is normal during shutdown ({e:#})");
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
DELETION_QUEUE_ERRORS.with_label_values(&["put_list"]).inc();
|
||||
warn!(
|
||||
sequence = self.pending.sequence,
|
||||
"Failed to write deletion list to remote storage, will retry later ({e:#})"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn recover(&mut self) -> Result<(), anyhow::Error> {
|
||||
// Load header: this is not required to be present, e.g. when a pageserver first runs
|
||||
let header_path = self.conf.deletion_header_path();
|
||||
|
||||
// Synchronous, but we only do it once per process lifetime so it's tolerable
|
||||
create_dir_all(&self.conf.deletion_prefix())?;
|
||||
|
||||
let header_bytes = match tokio::fs::read(&header_path).await {
|
||||
Ok(h) => Ok(Some(h)),
|
||||
Err(e) => {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
debug!(
|
||||
"Deletion header {0} not found, first start?",
|
||||
header_path.display()
|
||||
);
|
||||
Ok(None)
|
||||
} else {
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}?;
|
||||
|
||||
if let Some(header_bytes) = header_bytes {
|
||||
if let Some(header) = match serde_json::from_slice::<DeletionHeader>(&header_bytes) {
|
||||
Ok(h) => Some(h),
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"Failed to deserialize deletion header, ignoring {0}: {e:#}",
|
||||
header_path.display()
|
||||
);
|
||||
// This should never happen unless we make a mistake with our serialization.
|
||||
// Ignoring a deletion header is not consequential for correctness because all deletions
|
||||
// are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
|
||||
None
|
||||
}
|
||||
} {
|
||||
self.pending.sequence =
|
||||
std::cmp::max(self.pending.sequence, header.last_deleted_list_seq + 1);
|
||||
};
|
||||
};
|
||||
|
||||
let mut dir = match tokio::fs::read_dir(&self.conf.deletion_prefix()).await {
|
||||
Ok(d) => d,
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"Failed to open deletion list directory {0}: {e:#}",
|
||||
header_path.display()
|
||||
);
|
||||
|
||||
// Give up: if we can't read the deletion list directory, we probably can't
|
||||
// write lists into it later, so the queue won't work.
|
||||
return Err(e.into());
|
||||
}
|
||||
};
|
||||
|
||||
let list_name_pattern = Regex::new("([a-zA-Z0-9]{16})-([a-zA-Z0-9]{2}).list").unwrap();
|
||||
|
||||
let mut seqs: Vec<u64> = Vec::new();
|
||||
while let Some(dentry) = dir.next_entry().await? {
|
||||
let file_name = dentry.file_name().to_owned();
|
||||
let basename = file_name.to_string_lossy();
|
||||
let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
|
||||
m.get(1)
|
||||
.expect("Non optional group should be present")
|
||||
.as_str()
|
||||
} else {
|
||||
warn!("Unexpected key in deletion queue: {basename}");
|
||||
continue;
|
||||
};
|
||||
|
||||
let seq: u64 = match u64::from_str_radix(seq_part, 16) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
warn!("Malformed key '{basename}': {e}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
seqs.push(seq);
|
||||
}
|
||||
seqs.sort();
|
||||
|
||||
// Initialize the next sequence number in the frontend based on the maximum of the highest list we see,
|
||||
// and the last list that was deleted according to the header. Combined with writing out the header
|
||||
// prior to deletions, this guarantees no re-use of sequence numbers.
|
||||
if let Some(max_list_seq) = seqs.last() {
|
||||
self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1);
|
||||
}
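// Worked example with hypothetical values: if the header recorded last_deleted_list_seq = 5
// (so the header step above already raised pending.sequence to 6) and the highest list found
// on disk is 0000000000000007-01.list, then pending.sequence becomes max(6, 7 + 1) = 8, so
// this process starts writing at sequence 8 and never reuses an earlier number.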
|
||||
|
||||
for s in seqs {
|
||||
let list_path = self.conf.deletion_list_path(s);
|
||||
let list_bytes = tokio::fs::read(&list_path).await?;
|
||||
|
||||
let deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
|
||||
Ok(l) => l,
|
||||
Err(e) => {
|
||||
// Drop the list on the floor: any objects it referenced will be left behind
|
||||
// for scrubbing to clean up. This should never happen unless we have a serialization bug.
|
||||
warn!(sequence = s, "Failed to deserialize deletion list: {e}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// We will drop out of recovery if this fails: it indicates that we are shutting down
|
||||
// or the backend has panicked
|
||||
DELETION_QUEUE_SUBMITTED.inc_by(deletion_list.len() as u64);
|
||||
self.tx
|
||||
.send(BackendQueueMessage::Delete(deletion_list))
|
||||
.await?;
|
||||
}
|
||||
|
||||
info!(next_sequence = self.pending.sequence, "Replay complete");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is the front-end ingest, where we bundle up deletion requests into DeletionList
|
||||
/// and write them out, for later execution by the backend worker
|
||||
pub async fn background(&mut self) {
|
||||
info!("Started deletion frontend worker");
|
||||
|
||||
let mut recovered: bool = false;
|
||||
|
||||
while !self.cancel.is_cancelled() {
|
||||
let timeout = if self.pending_flushes.is_empty() {
|
||||
FRONTEND_DEFAULT_TIMEOUT
|
||||
} else {
|
||||
FRONTEND_FLUSHING_TIMEOUT
|
||||
};
|
||||
|
||||
let msg = match tokio::time::timeout(timeout, self.rx.recv()).await {
|
||||
Ok(Some(msg)) => msg,
|
||||
Ok(None) => {
|
||||
// Queue sender destroyed, shutting down
|
||||
break;
|
||||
}
|
||||
Err(_) => {
|
||||
// Hit deadline, flush.
|
||||
self.flush().await;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// On first message, do recovery. This avoids unnecessary recovery very
|
||||
// early in startup, and simplifies testing by avoiding a 404 reading the
|
||||
// header on every first pageserver startup.
|
||||
if !recovered {
|
||||
// Before accepting any input from this pageserver lifetime, recover all deletion lists that are in S3
|
||||
if let Err(e) = self.recover().await {
|
||||
// This should only happen in truly unrecoverable cases, like the recovery finding that the backend
|
||||
// queue receiver has been dropped.
|
||||
info!("Deletion queue recover aborted, deletion queue will not proceed ({e})");
|
||||
return;
|
||||
} else {
|
||||
recovered = true;
|
||||
}
|
||||
}
|
||||
|
||||
match msg {
|
||||
FrontendQueueMessage::Delete(op) => {
|
||||
debug!(
|
||||
"Delete: ingesting {0} layers, {1} other objects",
|
||||
op.layers.len(),
|
||||
op.objects.len()
|
||||
);
|
||||
|
||||
let mut layer_paths = Vec::new();
|
||||
for (layer, generation) in op.layers {
|
||||
layer_paths.push(remote_layer_path(
|
||||
&op.tenant_id,
|
||||
&op.timeline_id,
|
||||
&layer,
|
||||
generation,
|
||||
));
|
||||
}
|
||||
layer_paths.extend(op.objects);
|
||||
|
||||
if !self.pending.push(
|
||||
&op.tenant_id,
|
||||
&op.timeline_id,
|
||||
op.generation,
|
||||
&mut layer_paths,
|
||||
)
|
||||
{
|
||||
self.flush().await;
|
||||
let retry = self.pending.push(
|
||||
&op.tenant_id,
|
||||
&op.timeline_id,
|
||||
op.generation,
|
||||
&mut layer_paths,
|
||||
);
|
||||
if !retry {
|
||||
// Unexpected: after we flush, we should have
|
||||
// drained self.pending, so a conflict on
|
||||
// generation numbers should be impossible.
|
||||
tracing::error!(
|
||||
"Failed to enqueue deletions, leaking objects. This is a bug."
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
FrontendQueueMessage::Flush(op) => {
|
||||
if self.pending.is_empty() {
|
||||
// Execute immediately
|
||||
debug!("Flush: No pending objects, flushing immediately");
|
||||
op.fire()
|
||||
} else {
|
||||
// Execute next time we flush
|
||||
debug!("Flush: adding to pending flush list for next deadline flush");
|
||||
self.pending_flushes.push(op);
|
||||
}
|
||||
}
|
||||
FrontendQueueMessage::FlushExecute(op) => {
|
||||
debug!("FlushExecute: passing through to backend");
|
||||
// We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
|
||||
if let Err(e) = self.tx.send(BackendQueueMessage::Flush(op)).await {
|
||||
info!("Can't flush, shutting down ({e})");
|
||||
// Caller will get error when their oneshot sender was dropped.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
|
||||
self.flush().await;
|
||||
}
|
||||
}
|
||||
info!("Deletion queue shut down.");
|
||||
}
|
||||
}
|
||||
@@ -52,29 +52,6 @@ paths:
|
||||
schema:
|
||||
type: object
|
||||
|
||||
/v1/deletion_queue/flush:
|
||||
parameters:
|
||||
- name: execute
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: boolean
|
||||
description:
|
||||
If true, attempt to execute deletions. If false, just flush deletions to persistent deletion lists.
|
||||
put:
|
||||
description: Execute any deletions currently enqueued
|
||||
security: []
|
||||
responses:
|
||||
"200":
|
||||
description: |
|
||||
Flush completed: if execute was true, then enqueued deletions have been completed. If execute was false,
|
||||
then enqueued deletions have been persisted to deletion lists, and may have been completed.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
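# Usage sketch (illustrative; host and port are assumptions for a local pageserver):
#   curl -X PUT 'http://127.0.0.1:9898/v1/deletion_queue/flush?execute=true'
# With execute=false (or omitted) the call only persists pending deletions to
# deletion lists and does not wait for the objects to be removed.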
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -406,6 +383,7 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
|
||||
post:
|
||||
description: |
|
||||
Schedules attach operation to happen in the background for the given tenant.
|
||||
@@ -1042,9 +1020,6 @@ components:
|
||||
properties:
|
||||
config:
|
||||
$ref: '#/components/schemas/TenantConfig'
|
||||
generation:
|
||||
type: integer
|
||||
description: Attachment generation number.
|
||||
TenantConfigRequest:
|
||||
allOf:
|
||||
- $ref: '#/components/schemas/TenantConfig'
|
||||
|
||||
@@ -23,7 +23,6 @@ use super::models::{
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::{DeletionQueue, DeletionQueueError};
|
||||
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
@@ -33,13 +32,11 @@ use crate::tenant::mgr::{
|
||||
};
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
use crate::{disk_usage_eviction_task, tenant};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
generation::Generation,
|
||||
http::{
|
||||
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
|
||||
error::{ApiError, HttpErrorBody},
|
||||
@@ -59,7 +56,6 @@ struct State {
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue: DeletionQueue,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
}
|
||||
@@ -69,7 +65,6 @@ impl State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue: DeletionQueue,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
) -> anyhow::Result<Self> {
|
||||
@@ -83,7 +78,6 @@ impl State {
|
||||
allowlist_routes,
|
||||
remote_storage,
|
||||
broker_client,
|
||||
deletion_queue,
|
||||
disk_usage_eviction_state,
|
||||
})
|
||||
}
|
||||
@@ -478,7 +472,7 @@ async fn tenant_attach_handler(
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let maybe_body: Option<TenantAttachRequest> = json_request_or_empty_body(&mut request).await?;
|
||||
let tenant_conf = match &maybe_body {
|
||||
let tenant_conf = match maybe_body {
|
||||
Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?,
|
||||
None => TenantConfOpt::default(),
|
||||
};
|
||||
@@ -489,30 +483,13 @@ async fn tenant_attach_handler(
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let generation = if state.conf.control_plane_api.is_some() {
|
||||
// If we have been configured with a control plane URI, then generations are
|
||||
// mandatory, as we will attempt to re-attach on startup.
|
||||
maybe_body
|
||||
.as_ref()
|
||||
.map(|tar| tar.generation)
|
||||
.flatten()
|
||||
.map(|g| Generation::new(g))
|
||||
.ok_or(ApiError::BadRequest(anyhow!(
|
||||
"generation attribute missing"
|
||||
)))?
|
||||
} else {
|
||||
Generation::none()
|
||||
};
|
||||
|
||||
if let Some(remote_storage) = &state.remote_storage {
|
||||
mgr::attach_tenant(
|
||||
state.conf,
|
||||
tenant_id,
|
||||
generation,
|
||||
tenant_conf,
|
||||
state.broker_client.clone(),
|
||||
remote_storage.clone(),
|
||||
&state.deletion_queue,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("tenant_attach", %tenant_id))
|
||||
@@ -575,7 +552,6 @@ async fn tenant_load_handler(
|
||||
tenant_id,
|
||||
state.broker_client.clone(),
|
||||
state.remote_storage.clone(),
|
||||
&state.deletion_queue,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("load", %tenant_id))
|
||||
@@ -891,12 +867,6 @@ async fn tenant_create_handler(
|
||||
let tenant_conf =
|
||||
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||
|
||||
// TODO: make generation mandatory here once control plane supports it.
|
||||
let generation = request_data
|
||||
.generation
|
||||
.map(|g| Generation::new(g))
|
||||
.unwrap_or(Generation::none());
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let state = get_state(&request);
|
||||
@@ -905,10 +875,8 @@ async fn tenant_create_handler(
|
||||
state.conf,
|
||||
tenant_conf,
|
||||
target_tenant_id,
|
||||
generation,
|
||||
state.broker_client.clone(),
|
||||
state.remote_storage.clone(),
|
||||
&state.deletion_queue,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
|
||||
@@ -1149,48 +1117,6 @@ async fn always_panic_handler(
|
||||
json_response(StatusCode::NO_CONTENT, ())
|
||||
}
|
||||
|
||||
async fn deletion_queue_flush(
|
||||
r: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&r);
|
||||
|
||||
if state.remote_storage.is_none() {
|
||||
// Nothing to do if remote storage is disabled.
|
||||
return json_response(StatusCode::OK, ());
|
||||
}
|
||||
|
||||
let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
|
||||
|
||||
let queue_client = state.deletion_queue.new_client();
|
||||
|
||||
tokio::select! {
|
||||
flush_result = async {
|
||||
if execute {
|
||||
queue_client.flush_execute().await
|
||||
} else {
|
||||
queue_client.flush().await
|
||||
}
|
||||
} => {
|
||||
match flush_result {
|
||||
Ok(())=> {
|
||||
json_response(StatusCode::OK, ())
|
||||
},
|
||||
Err(e) => {
|
||||
match e {
|
||||
DeletionQueueError::ShuttingDown => {
|
||||
Err(ApiError::ShuttingDown)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
_ = cancel.cancelled() => {
|
||||
Err(ApiError::ShuttingDown)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn disk_usage_eviction_run(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -1400,7 +1326,6 @@ pub fn make_router(
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
broker_client: BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue: DeletionQueue,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
||||
let spec = include_bytes!("openapi_spec.yml");
|
||||
@@ -1430,7 +1355,6 @@ pub fn make_router(
|
||||
conf,
|
||||
auth,
|
||||
remote_storage,
|
||||
deletion_queue,
|
||||
broker_client,
|
||||
disk_usage_eviction_state,
|
||||
)
|
||||
@@ -1515,9 +1439,6 @@ pub fn make_router(
|
||||
.put("/v1/disk_usage_eviction/run", |r| {
|
||||
api_handler(r, disk_usage_eviction_run)
|
||||
})
|
||||
.put("/v1/deletion_queue/flush", |r| {
|
||||
api_handler(r, deletion_queue_flush)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/break", |r| {
|
||||
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
|
||||
})
|
||||
|
||||
@@ -3,7 +3,6 @@ pub mod basebackup;
|
||||
pub mod config;
|
||||
pub mod consumption_metrics;
|
||||
pub mod context;
|
||||
pub mod deletion_queue;
|
||||
pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
|
||||
@@ -795,31 +795,6 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static DELETION_QUEUE_SUBMITTED: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_deletion_queue_submitted_total",
|
||||
"Number of objects submitted for deletion"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static DELETION_QUEUE_EXECUTED: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_deletion_queue_executed_total",
|
||||
"Number of objects deleted"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static DELETION_QUEUE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_deletion_queue_errors_total",
|
||||
"Incremented on retryable remote I/O errors writing deletion lists or executing deletions.",
|
||||
&["op_kind"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
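// Example (illustrative PromQL, not part of this crate): the approximate backlog of
// enqueued-but-not-yet-executed deletions can be watched as
//   pageserver_deletion_queue_submitted_total - pageserver_deletion_queue_executed_total
// and a sustained rate(pageserver_deletion_queue_errors_total[5m]) > 0 indicates retries.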
|
||||
|
||||
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_remote_timeline_client_bytes_started",
|
||||
|
||||
@@ -75,7 +75,10 @@
|
||||
use std::{
|
||||
collections::{hash_map::Entry, HashMap},
|
||||
convert::TryInto,
|
||||
sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
|
||||
sync::{
|
||||
atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
|
||||
RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError,
|
||||
},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -159,7 +162,7 @@ struct Version {
|
||||
}
|
||||
|
||||
struct Slot {
|
||||
inner: tokio::sync::RwLock<SlotInner>,
|
||||
inner: RwLock<SlotInner>,
|
||||
usage_count: AtomicU8,
|
||||
}
|
||||
|
||||
@@ -200,11 +203,6 @@ impl Slot {
|
||||
Err(usage_count) => usage_count,
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the usage count to a specific value.
|
||||
fn set_usage_count(&self, count: u8) {
|
||||
self.usage_count.store(count, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PageCache {
|
||||
@@ -217,9 +215,9 @@ pub struct PageCache {
|
||||
///
|
||||
/// If you add support for caching different kinds of objects, each object kind
|
||||
/// can have a separate mapping map, next to this field.
|
||||
materialized_page_map: std::sync::RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
|
||||
materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
|
||||
|
||||
immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,
|
||||
immutable_page_map: RwLock<HashMap<(FileId, u32), usize>>,
|
||||
|
||||
/// The actual buffers with their metadata.
|
||||
slots: Box<[Slot]>,
|
||||
@@ -235,7 +233,7 @@ pub struct PageCache {
|
||||
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
|
||||
/// until the guard is dropped.
|
||||
///
|
||||
pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);
|
||||
pub struct PageReadGuard<'i>(RwLockReadGuard<'i, SlotInner>);
|
||||
|
||||
impl std::ops::Deref for PageReadGuard<'_> {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
@@ -262,10 +260,9 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
|
||||
/// to initialize.
|
||||
///
|
||||
pub struct PageWriteGuard<'i> {
|
||||
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
|
||||
inner: RwLockWriteGuard<'i, SlotInner>,
|
||||
|
||||
// Are the page contents currently valid?
|
||||
// Used to mark pages as invalid that are assigned but not yet filled with data.
|
||||
valid: bool,
|
||||
}
|
||||
|
||||
@@ -340,7 +337,7 @@ impl PageCache {
|
||||
/// The 'lsn' is an upper bound, this will return the latest version of
|
||||
/// the given block, but not newer than 'lsn'. Returns the actual LSN of the
|
||||
/// returned page.
|
||||
pub async fn lookup_materialized_page(
|
||||
pub fn lookup_materialized_page(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
@@ -360,7 +357,7 @@ impl PageCache {
|
||||
lsn,
|
||||
};
|
||||
|
||||
if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
|
||||
if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
|
||||
if let CacheKey::MaterializedPage {
|
||||
hash_key: _,
|
||||
lsn: available_lsn,
|
||||
@@ -387,7 +384,7 @@ impl PageCache {
|
||||
///
|
||||
/// Store an image of the given page in the cache.
|
||||
///
|
||||
pub async fn memorize_materialized_page(
|
||||
pub fn memorize_materialized_page(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
@@ -404,7 +401,7 @@ impl PageCache {
|
||||
lsn,
|
||||
};
|
||||
|
||||
match self.lock_for_write(&cache_key).await? {
|
||||
match self.lock_for_write(&cache_key)? {
|
||||
WriteBufResult::Found(write_guard) => {
|
||||
// We already had it in cache. Another thread must've put it there
|
||||
// concurrently. Check that it had the same contents that we
|
||||
@@ -422,14 +419,31 @@ impl PageCache {
|
||||
|
||||
// Section 1.2: Public interface functions for working with immutable file pages.
|
||||
|
||||
pub async fn read_immutable_buf(
|
||||
&self,
|
||||
file_id: FileId,
|
||||
blkno: u32,
|
||||
) -> anyhow::Result<ReadBufResult> {
|
||||
pub fn read_immutable_buf(&self, file_id: FileId, blkno: u32) -> anyhow::Result<ReadBufResult> {
|
||||
let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
|
||||
|
||||
self.lock_for_read(&mut cache_key).await
|
||||
self.lock_for_read(&mut cache_key)
|
||||
}
|
||||
|
||||
/// Immediately drop all buffers belonging to given file
|
||||
pub fn drop_buffers_for_immutable(&self, drop_file_id: FileId) {
|
||||
for slot_idx in 0..self.slots.len() {
|
||||
let slot = &self.slots[slot_idx];
|
||||
|
||||
let mut inner = slot.inner.write().unwrap();
|
||||
if let Some(key) = &inner.key {
|
||||
match key {
|
||||
CacheKey::ImmutableFilePage { file_id, blkno: _ }
|
||||
if *file_id == drop_file_id =>
|
||||
{
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(key);
|
||||
inner.key = None;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
@@ -449,14 +463,14 @@ impl PageCache {
|
||||
///
|
||||
/// If no page is found, returns None and *cache_key is left unmodified.
|
||||
///
|
||||
async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
|
||||
fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
|
||||
let cache_key_orig = cache_key.clone();
|
||||
if let Some(slot_idx) = self.search_mapping(cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.read().await;
|
||||
let inner = slot.inner.read().unwrap();
|
||||
if inner.key.as_ref() == Some(cache_key) {
|
||||
slot.inc_usage_count();
|
||||
return Some(PageReadGuard(inner));
|
||||
@@ -497,7 +511,7 @@ impl PageCache {
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
async fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
|
||||
fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
|
||||
let (read_access, hit) = match cache_key {
|
||||
CacheKey::MaterializedPage { .. } => {
|
||||
unreachable!("Materialized pages use lookup_materialized_page")
|
||||
@@ -512,7 +526,7 @@ impl PageCache {
|
||||
let mut is_first_iteration = true;
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
|
||||
if let Some(read_guard) = self.try_lock_for_read(cache_key) {
|
||||
if is_first_iteration {
|
||||
hit.inc();
|
||||
}
|
||||
@@ -542,7 +556,7 @@ impl PageCache {
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
slot.set_usage_count(1);
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
return Ok(ReadBufResult::NotFound(PageWriteGuard {
|
||||
inner,
|
||||
@@ -555,13 +569,13 @@ impl PageCache {
|
||||
/// found, returns None.
|
||||
///
|
||||
/// When locking a page for writing, the search criteria is always "exact".
|
||||
async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
|
||||
fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
|
||||
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.write().await;
|
||||
let inner = slot.inner.write().unwrap();
|
||||
if inner.key.as_ref() == Some(cache_key) {
|
||||
slot.inc_usage_count();
|
||||
return Some(PageWriteGuard { inner, valid: true });
|
||||
@@ -574,10 +588,10 @@ impl PageCache {
|
||||
///
|
||||
/// Similar to lock_for_read(), but the returned buffer is write-locked and
|
||||
/// may be modified by the caller even if it's already found in the cache.
|
||||
async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
|
||||
fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
|
||||
if let Some(write_guard) = self.try_lock_for_write(cache_key) {
|
||||
return Ok(WriteBufResult::Found(write_guard));
|
||||
}
|
||||
|
||||
@@ -603,7 +617,7 @@ impl PageCache {
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
slot.set_usage_count(1);
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
return Ok(WriteBufResult::NotFound(PageWriteGuard {
|
||||
inner,
|
||||
@@ -758,7 +772,7 @@ impl PageCache {
|
||||
/// Find a slot to evict.
|
||||
///
|
||||
/// On return, the slot is empty and write-locked.
|
||||
fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
|
||||
fn find_victim(&self) -> anyhow::Result<(usize, RwLockWriteGuard<SlotInner>)> {
|
||||
let iter_limit = self.slots.len() * 10;
|
||||
let mut iters = 0;
|
||||
loop {
|
||||
@@ -770,7 +784,10 @@ impl PageCache {
|
||||
if slot.dec_usage_count() == 0 {
|
||||
let mut inner = match slot.inner.try_write() {
|
||||
Ok(inner) => inner,
|
||||
Err(_err) => {
|
||||
Err(TryLockError::Poisoned(err)) => {
|
||||
anyhow::bail!("buffer lock was poisoned: {err:?}")
|
||||
}
|
||||
Err(TryLockError::WouldBlock) => {
|
||||
// If we have looped through the whole buffer pool 10 times
|
||||
// and still haven't found a victim buffer, something's wrong.
|
||||
// Maybe all the buffers were locked. That could happen in
|
||||
@@ -799,8 +816,6 @@ impl PageCache {
|
||||
fn new(num_pages: usize) -> Self {
|
||||
assert!(num_pages > 0, "page cache size must be > 0");
|
||||
|
||||
// We use Box::leak here and into_boxed_slice to avoid leaking uninitialized
|
||||
// memory that Vecs might contain.
|
||||
let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
|
||||
|
||||
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
|
||||
@@ -814,7 +829,7 @@ impl PageCache {
|
||||
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
|
||||
|
||||
Slot {
|
||||
inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
|
||||
inner: RwLock::new(SlotInner { key: None, buf }),
|
||||
usage_count: AtomicU8::new(0),
|
||||
}
|
||||
})
|
||||
|
||||
@@ -59,7 +59,6 @@ use self::timeline::EvictionTaskTenantState;
|
||||
use self::timeline::TimelineResources;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::import_datadir;
|
||||
use crate::is_uninit_mark;
|
||||
use crate::metrics::TENANT_ACTIVATION;
|
||||
@@ -86,7 +85,6 @@ pub use pageserver_api::models::TenantState;
|
||||
use toml_edit;
|
||||
use utils::{
|
||||
crashsafe,
|
||||
generation::Generation,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::{Lsn, RecordLsn},
|
||||
};
|
||||
@@ -121,7 +119,7 @@ mod span;
|
||||
|
||||
pub mod metadata;
|
||||
mod par_fsync;
|
||||
pub mod remote_timeline_client;
|
||||
mod remote_timeline_client;
|
||||
pub mod storage_layer;
|
||||
|
||||
pub mod config;
|
||||
@@ -158,7 +156,6 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
|
||||
pub struct TenantSharedResources {
|
||||
pub broker_client: storage_broker::BrokerClientChannel,
|
||||
pub remote_storage: Option<GenericRemoteStorage>,
|
||||
pub deletion_queue_client: DeletionQueueClient,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -181,10 +178,6 @@ pub struct Tenant {
|
||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||
|
||||
tenant_id: TenantId,
|
||||
|
||||
// The remote storage generation, used to protect S3 objects from split-brain
|
||||
generation: Generation,
|
||||
|
||||
timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
|
||||
// This mutex prevents creation of new timelines during GC.
|
||||
// Adding yet another mutex (in addition to `timelines`) is needed because holding
|
||||
@@ -198,9 +191,6 @@ pub struct Tenant {
|
||||
// provides access to timeline data sitting in the remote storage
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
|
||||
// Access to global deletion queue for when this tenant wants to schedule a deletion
|
||||
deletion_queue_client: Option<DeletionQueueClient>,
|
||||
|
||||
/// Cached logical sizes, updated on each [`Tenant::gather_size_inputs`].
|
||||
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
|
||||
cached_synthetic_tenant_size: Arc<AtomicU64>,
|
||||
@@ -432,53 +422,13 @@ impl Tenant {
|
||||
init_order,
|
||||
CreateTimelineCause::Load,
|
||||
)?;
|
||||
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
|
||||
let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn();
|
||||
anyhow::ensure!(
|
||||
disk_consistent_lsn.is_valid(),
|
||||
new_disk_consistent_lsn.is_valid(),
|
||||
"Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
|
||||
);
|
||||
assert_eq!(
|
||||
disk_consistent_lsn,
|
||||
up_to_date_metadata.disk_consistent_lsn(),
|
||||
"these are used interchangeably"
|
||||
);
|
||||
|
||||
// Save the metadata file to local disk.
|
||||
if !picked_local {
|
||||
save_metadata(
|
||||
self.conf,
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
up_to_date_metadata,
|
||||
first_save,
|
||||
)
|
||||
.context("save_metadata")?;
|
||||
}
|
||||
|
||||
let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
|
||||
|
||||
if let Some(index_part) = index_part {
|
||||
timeline
|
||||
.remote_client
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.init_upload_queue(index_part)?;
|
||||
} else if self.remote_storage.is_some() {
|
||||
// No data in remote storage, but we have a local metadata file. We can end up
|
||||
// here with timeline_create being interrupted before finishing index part upload.
|
||||
// By doing what we do here, the index part upload is retried.
|
||||
// If control plane retries timeline creation in the meantime, the mgmt API handler
|
||||
// for timeline creation will coalesce on the upload we queue here.
|
||||
let rtc = timeline.remote_client.as_ref().unwrap();
|
||||
rtc.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
|
||||
rtc.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
|
||||
}
|
||||
|
||||
timeline
|
||||
.load_layer_map(
|
||||
disk_consistent_lsn,
|
||||
remote_startup_data.map(|x| x.index_part),
|
||||
)
|
||||
.load_layer_map(new_disk_consistent_lsn)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
|
||||
@@ -502,6 +452,19 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
if self.remote_storage.is_some() {
|
||||
// Reconcile local state with remote storage, downloading anything that's
|
||||
// missing locally, and scheduling uploads for anything that's missing
|
||||
// in remote storage.
|
||||
timeline
|
||||
.reconcile_with_remote(
|
||||
up_to_date_metadata,
|
||||
remote_startup_data.as_ref().map(|r| &r.index_part),
|
||||
)
|
||||
.await
|
||||
.context("failed to reconcile with remote")?
|
||||
}
|
||||
|
||||
// Sanity check: a timeline should have some content.
|
||||
anyhow::ensure!(
|
||||
ancestor.is_some()
|
||||
@@ -516,6 +479,18 @@ impl Tenant {
|
||||
"Timeline has no ancestor and no layer files"
|
||||
);
|
||||
|
||||
// Save the metadata file to local disk.
|
||||
if !picked_local {
|
||||
save_metadata(
|
||||
self.conf,
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
up_to_date_metadata,
|
||||
first_save,
|
||||
)
|
||||
.context("save_metadata")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -532,11 +507,9 @@ impl Tenant {
|
||||
pub(crate) fn spawn_attach(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
// TODO dedup with spawn_load
|
||||
@@ -550,9 +523,7 @@ impl Tenant {
|
||||
tenant_conf,
|
||||
wal_redo_manager,
|
||||
tenant_id,
|
||||
generation,
|
||||
Some(remote_storage.clone()),
|
||||
Some(deletion_queue_client),
|
||||
));
|
||||
|
||||
// Do all the hard work in the background
|
||||
@@ -662,8 +633,12 @@ impl Tenant {
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?;
|
||||
|
||||
let remote_timeline_ids =
|
||||
remote_timeline_client::list_remote_timelines(remote_storage, self.tenant_id).await?;
|
||||
let remote_timeline_ids = remote_timeline_client::list_remote_timelines(
|
||||
remote_storage,
|
||||
self.conf,
|
||||
self.tenant_id,
|
||||
)
|
||||
.await?;
|
||||
|
||||
info!("found {} timelines", remote_timeline_ids.len());
|
||||
|
||||
@@ -675,7 +650,6 @@ impl Tenant {
|
||||
self.conf,
|
||||
self.tenant_id,
|
||||
timeline_id,
|
||||
self.generation,
|
||||
);
|
||||
part_downloads.spawn(
|
||||
async move {
|
||||
@@ -709,7 +683,10 @@ impl Tenant {
|
||||
debug!("successfully downloaded index part for timeline {timeline_id}");
|
||||
match index_part {
|
||||
MaybeDeletedIndexPart::IndexPart(index_part) => {
|
||||
timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
|
||||
timeline_ancestors.insert(
|
||||
timeline_id,
|
||||
index_part.parse_metadata().context("parse_metadata")?,
|
||||
);
|
||||
remote_index_and_client.insert(timeline_id, (index_part, client));
|
||||
}
|
||||
MaybeDeletedIndexPart::Deleted(index_part) => {
|
||||
@@ -738,7 +715,6 @@ impl Tenant {
|
||||
remote_metadata,
|
||||
TimelineResources {
|
||||
remote_client: Some(remote_client),
|
||||
deletion_queue_client: self.deletion_queue_client.clone(),
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
@@ -761,9 +737,8 @@ impl Tenant {
|
||||
DeleteTimelineFlow::resume_deletion(
|
||||
Arc::clone(self),
|
||||
timeline_id,
|
||||
&index_part.metadata,
|
||||
&index_part.parse_metadata().context("parse_metadata")?,
|
||||
Some(remote_timeline_client),
|
||||
self.deletion_queue_client.clone(),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
@@ -864,8 +839,6 @@ impl Tenant {
|
||||
TenantConfOpt::default(),
|
||||
wal_redo_manager,
|
||||
tenant_id,
|
||||
Generation::broken(),
|
||||
None,
|
||||
None,
|
||||
))
|
||||
}
|
||||
@@ -883,7 +856,6 @@ impl Tenant {
|
||||
pub(crate) fn spawn_load(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
resources: TenantSharedResources,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
@@ -901,7 +873,6 @@ impl Tenant {
|
||||
|
||||
let broker_client = resources.broker_client;
|
||||
let remote_storage = resources.remote_storage;
|
||||
let deletion_queue_client = resources.deletion_queue_client;
|
||||
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
|
||||
let tenant = Tenant::new(
|
||||
@@ -910,9 +881,7 @@ impl Tenant {
|
||||
tenant_conf,
|
||||
wal_redo_manager,
|
||||
tenant_id,
|
||||
generation,
|
||||
remote_storage.clone(),
|
||||
Some(deletion_queue_client),
|
||||
);
|
||||
let tenant = Arc::new(tenant);
|
||||
|
||||
@@ -1320,7 +1289,6 @@ impl Tenant {
|
||||
timeline_id,
|
||||
&local_metadata,
|
||||
Some(remote_client),
|
||||
self.deletion_queue_client.clone(),
|
||||
init_order,
|
||||
)
|
||||
.await
|
||||
@@ -1331,7 +1299,10 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
let remote_metadata = index_part.metadata.clone();
|
||||
let remote_metadata = index_part
|
||||
.parse_metadata()
|
||||
.context("parse_metadata")
|
||||
.map_err(LoadLocalTimelineError::Load)?;
|
||||
(
|
||||
Some(RemoteStartupData {
|
||||
index_part,
|
||||
@@ -1370,7 +1341,6 @@ impl Tenant {
|
||||
timeline_id,
|
||||
&local_metadata,
|
||||
None,
|
||||
None,
|
||||
init_order,
|
||||
)
|
||||
.await
|
||||
@@ -2295,7 +2265,6 @@ impl Tenant {
|
||||
ancestor,
|
||||
new_timeline_id,
|
||||
self.tenant_id,
|
||||
self.generation,
|
||||
Arc::clone(&self.walredo_mgr),
|
||||
resources,
|
||||
pg_version,
|
||||
@@ -2313,18 +2282,8 @@ impl Tenant {
|
||||
tenant_conf: TenantConfOpt,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue_client: Option<DeletionQueueClient>,
|
||||
) -> Tenant {
|
||||
#[cfg(not(test))]
|
||||
match state {
|
||||
TenantState::Broken { .. } => {}
|
||||
_ => {
|
||||
// Non-broken tenants must be constructed with a deletion queue
|
||||
assert!(deletion_queue_client.is_some());
|
||||
}
|
||||
}
|
||||
let (state, mut rx) = watch::channel(state);
|
||||
|
||||
tokio::spawn(async move {
|
||||
@@ -2381,7 +2340,6 @@ impl Tenant {
|
||||
|
||||
Tenant {
|
||||
tenant_id,
|
||||
generation,
|
||||
conf,
|
||||
// using now here is a good enough approximation to catch tenants with really long
|
||||
// activation times.
|
||||
@@ -2391,7 +2349,6 @@ impl Tenant {
|
||||
gc_cs: tokio::sync::Mutex::new(()),
|
||||
walredo_mgr,
|
||||
remote_storage,
|
||||
deletion_queue_client,
|
||||
state,
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
@@ -2965,17 +2922,13 @@ impl Tenant {
|
||||
self.conf,
|
||||
self.tenant_id,
|
||||
timeline_id,
|
||||
self.generation,
|
||||
);
|
||||
Some(remote_client)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
deletion_queue_client: self.deletion_queue_client.clone(),
|
||||
}
|
||||
TimelineResources { remote_client }
|
||||
}
|
||||
|
||||
/// Creates intermediate timeline structure and its files.
|
||||
@@ -3492,7 +3445,6 @@ pub mod harness {
|
||||
pub conf: &'static PageServerConf,
|
||||
pub tenant_conf: TenantConf,
|
||||
pub tenant_id: TenantId,
|
||||
pub generation: Generation,
|
||||
}
|
||||
|
||||
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
|
||||
@@ -3534,14 +3486,13 @@ pub mod harness {
|
||||
conf,
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
generation: Generation::new(0xdeadbeef),
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
(
|
||||
self.try_load(&ctx, None, None)
|
||||
self.try_load(&ctx, None)
|
||||
.await
|
||||
.expect("failed to load test tenant"),
|
||||
ctx,
|
||||
@@ -3552,7 +3503,6 @@ pub mod harness {
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
remote_storage: Option<remote_storage::GenericRemoteStorage>,
|
||||
deletion_queue_client: Option<DeletionQueueClient>,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
let walredo_mgr = Arc::new(TestRedoManager);
|
||||
|
||||
@@ -3562,9 +3512,7 @@ pub mod harness {
|
||||
TenantConfOpt::from(self.tenant_conf),
|
||||
walredo_mgr,
|
||||
self.tenant_id,
|
||||
self.generation,
|
||||
remote_storage,
|
||||
deletion_queue_client,
|
||||
));
|
||||
tenant
|
||||
.load(None, ctx)
|
||||
@@ -4129,7 +4077,7 @@ mod tests {
|
||||
std::fs::write(metadata_path, metadata_bytes)?;
|
||||
|
||||
let err = harness
|
||||
.try_load(&ctx, None, None)
|
||||
.try_load(&ctx, None)
|
||||
.await
|
||||
.err()
|
||||
.expect("should fail");
|
||||
@@ -4144,7 +4092,7 @@ mod tests {
|
||||
let mut found_error_message = false;
|
||||
let mut err_source = err.source();
|
||||
while let Some(source) = err_source {
|
||||
if source.to_string().contains("metadata checksum mismatch") {
|
||||
if source.to_string() == "metadata checksum mismatch" {
|
||||
found_error_message = true;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -12,11 +12,14 @@
|
||||
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
|
||||
//!
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
use crate::tenant::block_io::{BlockCursor, BlockReader};
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
|
||||
impl<'a> BlockCursor<'a> {
|
||||
impl<R> BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
/// Read a blob into a new buffer.
|
||||
pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
|
||||
let mut buf = Vec::new();
|
||||
@@ -33,7 +36,7 @@ impl<'a> BlockCursor<'a> {
|
||||
let mut blknum = (offset / PAGE_SZ as u64) as u32;
|
||||
let mut off = (offset % PAGE_SZ as u64) as usize;
|
||||
|
||||
let mut buf = self.read_blk(blknum).await?;
|
||||
let mut buf = self.read_blk(blknum)?;
|
||||
|
||||
// peek at the first byte, to determine if it's a 1- or 4-byte length
|
||||
let first_len_byte = buf[off];
|
||||
@@ -49,7 +52,7 @@ impl<'a> BlockCursor<'a> {
|
||||
// it is split across two pages
|
||||
len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
|
||||
blknum += 1;
|
||||
buf = self.read_blk(blknum).await?;
|
||||
buf = self.read_blk(blknum)?;
|
||||
len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
|
||||
off = 4 - thislen;
|
||||
} else {
|
||||
@@ -70,7 +73,7 @@ impl<'a> BlockCursor<'a> {
|
||||
if page_remain == 0 {
|
||||
// continue on next page
|
||||
blknum += 1;
|
||||
buf = self.read_blk(blknum).await?;
|
||||
buf = self.read_blk(blknum)?;
|
||||
off = 0;
|
||||
page_remain = PAGE_SZ;
|
||||
}
|
||||
|
||||
@@ -2,12 +2,8 @@
|
||||
//! Low-level Block-oriented I/O functions
|
||||
//!
|
||||
|
||||
use super::ephemeral_file::EphemeralFile;
|
||||
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
|
||||
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use bytes::Bytes;
|
||||
use std::fs::File;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
@@ -17,20 +13,32 @@ use std::os::unix::fs::FileExt;
|
||||
/// There are currently two implementations: EphemeralFile, and FileBlockReader
|
||||
/// below.
|
||||
pub trait BlockReader {
|
||||
///
|
||||
/// Read a block. Returns a "lease" object that can be used to
|
||||
/// access the contents of the page. (For the page cache, the
|
||||
/// lease object represents a lock on the buffer.)
|
||||
///
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error>;
|
||||
|
||||
///
|
||||
/// Create a new "cursor" for reading from this reader.
|
||||
///
|
||||
/// A cursor caches the last accessed page, allowing for faster
|
||||
/// access if the same block is accessed repeatedly.
|
||||
fn block_cursor(&self) -> BlockCursor<'_>;
|
||||
fn block_cursor(&self) -> BlockCursor<&Self>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
BlockCursor::new(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<B> BlockReader for &B
|
||||
where
|
||||
B: BlockReader,
|
||||
{
|
||||
fn block_cursor(&self) -> BlockCursor<'_> {
|
||||
(*self).block_cursor()
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
(*self).read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,7 +47,7 @@ pub enum BlockLease<'a> {
|
||||
PageReadGuard(PageReadGuard<'static>),
|
||||
EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
|
||||
#[cfg(test)]
|
||||
Arc(std::sync::Arc<[u8; PAGE_SZ]>),
|
||||
Rc(std::rc::Rc<[u8; PAGE_SZ]>),
|
||||
}
|
||||
|
||||
impl From<PageReadGuard<'static>> for BlockLease<'static> {
|
||||
@@ -49,9 +57,9 @@ impl From<PageReadGuard<'static>> for BlockLease<'static> {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<'a> From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'a> {
|
||||
fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self {
|
||||
BlockLease::Arc(value)
|
||||
impl<'a> From<std::rc::Rc<[u8; PAGE_SZ]>> for BlockLease<'a> {
|
||||
fn from(value: std::rc::Rc<[u8; PAGE_SZ]>) -> Self {
|
||||
BlockLease::Rc(value)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,35 +71,7 @@ impl<'a> Deref for BlockLease<'a> {
|
||||
BlockLease::PageReadGuard(v) => v.deref(),
|
||||
BlockLease::EphemeralFileMutableTail(v) => v,
|
||||
#[cfg(test)]
|
||||
BlockLease::Arc(v) => v.deref(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Provides the ability to read blocks from different sources,
|
||||
/// similar to using traits for this purpose.
|
||||
///
|
||||
/// Unlike traits, this also allows the read function to be async.
|
||||
pub(crate) enum BlockReaderRef<'a> {
|
||||
FileBlockReaderVirtual(&'a FileBlockReader<VirtualFile>),
|
||||
FileBlockReaderFile(&'a FileBlockReader<std::fs::File>),
|
||||
EphemeralFile(&'a EphemeralFile),
|
||||
Adapter(Adapter<&'a DeltaLayerInner>),
|
||||
#[cfg(test)]
|
||||
TestDisk(&'a super::disk_btree::tests::TestDisk),
|
||||
}
|
||||
|
||||
impl<'a> BlockReaderRef<'a> {
|
||||
#[inline(always)]
|
||||
async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
use BlockReaderRef::*;
|
||||
match self {
|
||||
FileBlockReaderVirtual(r) => r.read_blk(blknum).await,
|
||||
FileBlockReaderFile(r) => r.read_blk(blknum).await,
|
||||
EphemeralFile(r) => r.read_blk(blknum).await,
|
||||
Adapter(r) => r.read_blk(blknum).await,
|
||||
#[cfg(test)]
|
||||
TestDisk(r) => r.read_blk(blknum),
|
||||
BlockLease::Rc(v) => v.deref(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -113,29 +93,23 @@ impl<'a> BlockReaderRef<'a> {
|
||||
/// // do stuff with 'buf'
|
||||
/// ```
|
||||
///
|
||||
pub struct BlockCursor<'a> {
|
||||
reader: BlockReaderRef<'a>,
|
||||
pub struct BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
reader: R,
|
||||
}
|
||||
|
||||
impl<'a> BlockCursor<'a> {
|
||||
pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self {
|
||||
impl<R> BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
pub fn new(reader: R) -> Self {
|
||||
BlockCursor { reader }
|
||||
}
|
||||
// Needed by cli
|
||||
pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader<VirtualFile>) -> Self {
|
||||
BlockCursor {
|
||||
reader: BlockReaderRef::FileBlockReaderVirtual(reader),
|
||||
}
|
||||
}
|
||||
|
||||
/// Read a block.
|
||||
///
|
||||
/// Returns a "lease" object that can be used to
|
||||
/// access the contents of the page. (For the page cache, the
|
||||
/// lease object represents a lock on the buffer.)
|
||||
#[inline(always)]
|
||||
pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
self.reader.read_blk(blknum).await
|
||||
pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
self.reader.read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -165,17 +139,17 @@ where
|
||||
assert!(buf.len() == PAGE_SZ);
|
||||
self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
|
||||
}
|
||||
/// Read a block.
|
||||
///
|
||||
/// Returns a "lease" object that can be used to
|
||||
/// access to the contents of the page. (For the page cache, the
|
||||
/// lease object represents a lock on the buffer.)
|
||||
pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
}
|
||||
|
||||
impl<F> BlockReader for FileBlockReader<F>
|
||||
where
|
||||
F: FileExt,
|
||||
{
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache
|
||||
.read_immutable_buf(self.file_id, blknum)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
@@ -196,18 +170,6 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockReader for FileBlockReader<File> {
|
||||
fn block_cursor(&self) -> BlockCursor<'_> {
|
||||
BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self))
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockReader for FileBlockReader<VirtualFile> {
|
||||
fn block_cursor(&self) -> BlockCursor<'_> {
|
||||
BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self))
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Trait for block-oriented output
|
||||
///
|
||||
|
||||
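FileBlockReader::read_blk above follows the usual page-cache protocol: look the page up, and on a miss fill the returned write slot from disk, mark it valid, and retry until a read guard is handed back. The same loop reappears for the ephemeral file later in this diff. A stripped-down, synchronous sketch of that read-through idea (a toy cache, not crate::page_cache):

use std::collections::HashMap;
use std::io;

const PAGE_SZ: usize = 8192;

// Toy read-through cache keyed by block number.
struct ToyCache {
    pages: HashMap<u32, [u8; PAGE_SZ]>,
}

impl ToyCache {
    fn read_blk(
        &mut self,
        blknum: u32,
        read_from_disk: impl Fn(u32, &mut [u8; PAGE_SZ]) -> io::Result<()>,
    ) -> io::Result<&[u8; PAGE_SZ]> {
        if !self.pages.contains_key(&blknum) {
            // Cache miss: fill a fresh buffer from disk, then remember it
            // (the analogue of filling the write guard and calling mark_valid()).
            let mut buf = [0u8; PAGE_SZ];
            read_from_disk(blknum, &mut buf)?;
            self.pages.insert(blknum, buf);
        }
        Ok(self.pages.get(&blknum).expect("just inserted"))
    }
}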
@@ -7,7 +7,6 @@ use anyhow::Context;
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, instrument, warn, Instrument, Span};
|
||||
|
||||
use utils::{
|
||||
@@ -83,8 +82,6 @@ async fn create_remote_delete_mark(
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"mark_upload",
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await
|
||||
.context("mark_upload")?;
|
||||
@@ -174,8 +171,6 @@ async fn remove_tenant_remote_delete_mark(
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_tenant_remote_delete_mark",
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await
|
||||
.context("remove_tenant_remote_delete_mark")?;
|
||||
@@ -257,8 +252,6 @@ pub(crate) async fn remote_delete_mark_exists(
|
||||
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
|
||||
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
|
||||
"fetch_tenant_deletion_mark",
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await;
|
||||
|
||||
|
||||
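The retry wrappers above come from neon's own utils::backoff; the shape is "retry an async operation, start warning after a threshold, and give up after a fixed number of attempts". A simplified stand-alone sketch of that pattern (hypothetical helper, not the real backoff::retry signature; assumes tokio for sleeping):

use std::time::Duration;

async fn retry_op<T, E, Fut>(
    mut op: impl FnMut() -> Fut,
    warn_threshold: u32,
    max_retries: u32,
    what: &str,
) -> Result<T, E>
where
    E: std::fmt::Display,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    let mut attempt: u32 = 0;
    loop {
        match op().await {
            Ok(v) => return Ok(v),
            Err(e) if attempt < max_retries => {
                if attempt >= warn_threshold {
                    eprintln!("{what}: attempt {attempt} failed: {e}");
                }
                attempt += 1;
                // Crude exponential backoff between attempts, capped at ~3s.
                tokio::time::sleep(Duration::from_millis(100u64 << attempt.min(5))).await;
            }
            Err(e) => return Err(e),
        }
    }
}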
@@ -259,10 +259,9 @@ where
|
||||
{
|
||||
let mut stack = Vec::new();
|
||||
stack.push((self.root_blk, None));
|
||||
let block_cursor = self.reader.block_cursor();
|
||||
while let Some((node_blknum, opt_iter)) = stack.pop() {
|
||||
// Locate the node.
|
||||
let node_buf = block_cursor.read_blk(self.start_blk + node_blknum).await?;
|
||||
let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;
|
||||
|
||||
let node = OnDiskNode::deparse(node_buf.as_ref())?;
|
||||
let prefix_len = node.prefix_len as usize;
|
||||
@@ -354,10 +353,8 @@ where
|
||||
|
||||
stack.push((self.root_blk, String::new(), 0, 0, 0));
|
||||
|
||||
let block_cursor = self.reader.block_cursor();
|
||||
|
||||
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
|
||||
let blk = block_cursor.read_blk(self.start_blk + blknum).await?;
|
||||
let blk = self.reader.read_blk(self.start_blk + blknum)?;
|
||||
let buf: &[u8] = blk.as_ref();
|
||||
let node = OnDiskNode::<L>::deparse(buf)?;
|
||||
|
||||
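The B-tree reader above walks the tree with an explicit Vec-based stack instead of recursion, which keeps each step a plain loop iteration around a block read. The same idea in isolation, as a small sketch with a caller-supplied child lookup:

// Iterative depth-first walk over a tree identified by u32 node ids.
fn walk_tree(root: u32, children_of: impl Fn(u32) -> Vec<u32>, mut visit: impl FnMut(u32)) {
    let mut stack = vec![root];
    while let Some(node) = stack.pop() {
        visit(node);
        // Push children in reverse so the leftmost child is visited first.
        for child in children_of(node).into_iter().rev() {
            stack.push(child);
        }
    }
}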
@@ -686,30 +683,27 @@ impl<const L: usize> BuildNode<L> {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
|
||||
use crate::tenant::block_io::BlockLease;
|
||||
use rand::Rng;
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub(crate) struct TestDisk {
|
||||
struct TestDisk {
|
||||
blocks: Vec<Bytes>,
|
||||
}
|
||||
impl TestDisk {
|
||||
fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
pub(crate) fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
|
||||
let mut buf = [0u8; PAGE_SZ];
|
||||
buf.copy_from_slice(&self.blocks[blknum as usize]);
|
||||
Ok(std::sync::Arc::new(buf).into())
|
||||
}
|
||||
}
|
||||
impl BlockReader for TestDisk {
|
||||
fn block_cursor(&self) -> BlockCursor<'_> {
|
||||
BlockCursor::new(BlockReaderRef::TestDisk(self))
|
||||
fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
|
||||
let mut buf = [0u8; PAGE_SZ];
|
||||
buf.copy_from_slice(&self.blocks[blknum as usize]);
|
||||
Ok(std::rc::Rc::new(buf).into())
|
||||
}
|
||||
}
|
||||
impl BlockWriter for &mut TestDisk {
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::page_cache::{self, PAGE_SZ};
|
||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockLease, BlockReader};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use std::cmp::min;
|
||||
use std::fs::OpenOptions;
|
||||
@@ -21,7 +22,7 @@ pub struct EphemeralFile {
|
||||
_tenant_id: TenantId,
|
||||
_timeline_id: TimelineId,
|
||||
file: VirtualFile,
|
||||
len: u64,
|
||||
size: u64,
|
||||
/// An ephemeral file is append-only.
|
||||
/// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
|
||||
/// The other pages, which can no longer be modified, are accessed through the page cache.
|
||||
@@ -52,57 +53,27 @@ impl EphemeralFile {
|
||||
_tenant_id: tenant_id,
|
||||
_timeline_id: timeline_id,
|
||||
file,
|
||||
len: 0,
|
||||
size: 0,
|
||||
mutable_tail: [0u8; PAGE_SZ],
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn len(&self) -> u64 {
|
||||
self.len
|
||||
pub(crate) fn size(&self) -> u64 {
|
||||
self.size
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
|
||||
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
|
||||
if flushed_blknums.contains(&(blknum as u64)) {
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
// order path before error because error is anyhow::Error => might have many contexts
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum,
|
||||
self.file.path.display(),
|
||||
e,
|
||||
),
|
||||
)
|
||||
})? {
|
||||
page_cache::ReadBufResult::Found(guard) => {
|
||||
return Ok(BlockLease::PageReadGuard(guard))
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
self.file
|
||||
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// Swap for read lock
|
||||
continue;
|
||||
}
|
||||
};
|
||||
}
|
||||
} else {
|
||||
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
|
||||
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
|
||||
}
|
||||
/// Does the given filename look like an ephemeral file?
|
||||
pub fn is_ephemeral_file(filename: &str) -> bool {
|
||||
if let Some(rest) = filename.strip_prefix("ephemeral-") {
|
||||
rest.parse::<u32>().is_ok()
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
|
||||
impl BlobWriter for EphemeralFile {
|
||||
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
|
||||
struct Writer<'a> {
|
||||
ephemeral_file: &'a mut EphemeralFile,
|
||||
/// The block to which the next [`push_bytes`] will write.
|
||||
@@ -113,13 +84,13 @@ impl EphemeralFile {
|
||||
impl<'a> Writer<'a> {
|
||||
fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
|
||||
Ok(Writer {
|
||||
blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
|
||||
off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
|
||||
blknum: (ephemeral_file.size / PAGE_SZ as u64) as u32,
|
||||
off: (ephemeral_file.size % PAGE_SZ as u64) as usize,
|
||||
ephemeral_file,
|
||||
})
|
||||
}
|
||||
#[inline(always)]
|
||||
async fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
|
||||
fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
|
||||
let mut src_remaining = src;
|
||||
while !src_remaining.is_empty() {
|
||||
let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
|
||||
@@ -136,13 +107,10 @@ impl EphemeralFile {
|
||||
// Pre-warm the page cache with what we just wrote.
|
||||
// This isn't necessary for coherency/correctness, but it's how we've always done it.
|
||||
let cache = page_cache::get();
|
||||
match cache
|
||||
.read_immutable_buf(
|
||||
self.ephemeral_file.page_cache_file_id,
|
||||
self.blknum,
|
||||
)
|
||||
.await
|
||||
{
|
||||
match cache.read_immutable_buf(
|
||||
self.ephemeral_file.page_cache_file_id,
|
||||
self.blknum,
|
||||
) {
|
||||
Ok(page_cache::ReadBufResult::Found(_guard)) => {
|
||||
// This function takes &mut self, so, it shouldn't be possible to reach this point.
|
||||
unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
|
||||
@@ -186,47 +154,39 @@ impl EphemeralFile {
|
||||
}
|
||||
}
|
||||
|
||||
let pos = self.len;
|
||||
let pos = self.size;
|
||||
let mut writer = Writer::new(self)?;
|
||||
|
||||
// Write the length field
|
||||
if srcbuf.len() < 0x80 {
|
||||
// short one-byte length header
|
||||
let len_buf = [srcbuf.len() as u8];
|
||||
writer.push_bytes(&len_buf).await?;
|
||||
writer.push_bytes(&len_buf)?;
|
||||
} else {
|
||||
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
|
||||
len_buf[0] |= 0x80;
|
||||
writer.push_bytes(&len_buf).await?;
|
||||
writer.push_bytes(&len_buf)?;
|
||||
}
|
||||
|
||||
// Write the payload
|
||||
writer.push_bytes(srcbuf).await?;
|
||||
writer.push_bytes(srcbuf)?;
|
||||
|
||||
if srcbuf.len() < 0x80 {
|
||||
self.len += 1;
|
||||
self.size += 1;
|
||||
} else {
|
||||
self.len += 4;
|
||||
self.size += 4;
|
||||
}
|
||||
self.len += srcbuf.len() as u64;
|
||||
self.size += srcbuf.len() as u64;
|
||||
|
||||
Ok(pos)
|
||||
}
|
||||
}
|
||||
|
||||
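write_blob keeps the existing blob length header: a length below 0x80 is written as a single byte, anything larger as a 4-byte big-endian length with the top bit set (so lengths must fit in 31 bits). A small sketch of that header, shown here with a matching decoder for illustration:

// Encode the blob length header used by write_blob above.
fn encode_len_header(len: usize) -> Vec<u8> {
    if len < 0x80 {
        // Short one-byte header.
        vec![len as u8]
    } else {
        // 4-byte big-endian length; the high bit of the first byte marks this form.
        let mut buf = u32::to_be_bytes(len as u32);
        buf[0] |= 0x80;
        buf.to_vec()
    }
}

// Decode the header back into (blob length, header length in bytes).
fn decode_len_header(hdr: &[u8]) -> (usize, usize) {
    if hdr[0] & 0x80 == 0 {
        (hdr[0] as usize, 1)
    } else {
        let raw = [hdr[0] & !0x80, hdr[1], hdr[2], hdr[3]];
        (u32::from_be_bytes(raw) as usize, 4)
    }
}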
/// Does the given filename look like an ephemeral file?
|
||||
pub fn is_ephemeral_file(filename: &str) -> bool {
|
||||
if let Some(rest) = filename.strip_prefix("ephemeral-") {
|
||||
rest.parse::<u32>().is_ok()
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for EphemeralFile {
|
||||
fn drop(&mut self) {
|
||||
// There might still be pages in the [`crate::page_cache`] for this file.
|
||||
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
|
||||
// drop all pages from page cache
|
||||
let cache = page_cache::get();
|
||||
cache.drop_buffers_for_immutable(self.page_cache_file_id);
|
||||
|
||||
// unlink the file
|
||||
let res = std::fs::remove_file(&self.file.path);
|
||||
@@ -247,15 +207,52 @@ impl Drop for EphemeralFile {
|
||||
}
|
||||
|
||||
impl BlockReader for EphemeralFile {
|
||||
fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
|
||||
BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
|
||||
let flushed_blknums = 0..self.size / PAGE_SZ as u64;
|
||||
if flushed_blknums.contains(&(blknum as u64)) {
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum)
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
// order path before error because error is anyhow::Error => might have many contexts
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum,
|
||||
self.file.path.display(),
|
||||
e,
|
||||
),
|
||||
)
|
||||
})? {
|
||||
page_cache::ReadBufResult::Found(guard) => {
|
||||
return Ok(BlockLease::PageReadGuard(guard))
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
self.file
|
||||
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// Swap for read lock
|
||||
continue;
|
||||
}
|
||||
};
|
||||
}
|
||||
} else {
|
||||
debug_assert_eq!(blknum as u64, self.size / PAGE_SZ as u64);
|
||||
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
use rand::{thread_rng, RngCore};
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
@@ -283,12 +280,12 @@ mod tests {
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
|
||||
let pos_foo = file.write_blob(b"foo").await?;
|
||||
let pos_foo = file.write_blob(b"foo")?;
|
||||
assert_eq!(
|
||||
b"foo",
|
||||
file.block_cursor().read_blob(pos_foo).await?.as_slice()
|
||||
);
|
||||
let pos_bar = file.write_blob(b"bar").await?;
|
||||
let pos_bar = file.write_blob(b"bar")?;
|
||||
assert_eq!(
|
||||
b"foo",
|
||||
file.block_cursor().read_blob(pos_foo).await?.as_slice()
|
||||
@@ -301,17 +298,17 @@ mod tests {
|
||||
let mut blobs = Vec::new();
|
||||
for i in 0..10000 {
|
||||
let data = Vec::from(format!("blob{}", i).as_bytes());
|
||||
let pos = file.write_blob(&data).await?;
|
||||
let pos = file.write_blob(&data)?;
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
// also test with large blobs
|
||||
for i in 0..100 {
|
||||
let data = format!("blob{}", i).as_bytes().repeat(100);
|
||||
let pos = file.write_blob(&data).await?;
|
||||
let pos = file.write_blob(&data)?;
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
|
||||
let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
|
||||
let cursor = BlockCursor::new(&file);
|
||||
for (pos, expected) in blobs {
|
||||
let actual = cursor.read_blob(pos).await?;
|
||||
assert_eq!(actual, expected);
|
||||
@@ -321,7 +318,7 @@ mod tests {
|
||||
let mut large_data = Vec::new();
|
||||
large_data.resize(20000, 0);
|
||||
thread_rng().fill_bytes(&mut large_data);
|
||||
let pos_large = file.write_blob(&large_data).await?;
|
||||
let pos_large = file.write_blob(&large_data)?;
|
||||
let result = file.block_cursor().read_blob(pos_large).await?;
|
||||
assert_eq!(result, large_data);
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ use std::fs::{File, OpenOptions};
|
||||
use std::io::{self, Write};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use serde::{de::Error, Deserialize, Serialize, Serializer};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror::Error;
|
||||
use tracing::info_span;
|
||||
use utils::bin_ser::SerializeError;
|
||||
@@ -232,28 +232,6 @@ impl TimelineMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for TimelineMetadata {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let bytes = Vec::<u8>::deserialize(deserializer)?;
|
||||
Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}")))
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for TimelineMetadata {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
let bytes = self
|
||||
.to_bytes()
|
||||
.map_err(|e| serde::ser::Error::custom(format!("{e}")))?;
|
||||
bytes.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
/// Save timeline metadata to file
|
||||
pub fn save_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
@@ -1,13 +1,10 @@
|
||||
//! This module acts as a switchboard to access different repositories managed by this
|
||||
//! page server.
|
||||
|
||||
use hyper::StatusCode;
|
||||
use pageserver_api::control_api::{HexTenantId, ReAttachRequest, ReAttachResponse};
|
||||
use std::collections::{hash_map, HashMap};
|
||||
use std::ffi::OsStr;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::fs;
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -21,7 +18,6 @@ use utils::crashsafe;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueue;
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
@@ -29,7 +25,6 @@ use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantSt
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
|
||||
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::delete::DeleteTenantError;
|
||||
@@ -80,78 +75,6 @@ pub async fn init_tenant_mgr(
|
||||
|
||||
let mut tenants = HashMap::new();
|
||||
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what to attach
|
||||
let tenant_generations = conf
|
||||
.control_plane_api
|
||||
.as_ref()
|
||||
.map(|control_plane_api| async {
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client");
|
||||
|
||||
// FIXME: it's awkward that join() requires the base to have a trailing slash, makes
|
||||
// it easy to get a config wrong
|
||||
assert!(
|
||||
control_plane_api.as_str().ends_with("/"),
|
||||
"control plane API needs trailing slash"
|
||||
);
|
||||
|
||||
let re_attach_path = control_plane_api
|
||||
.join("re-attach")
|
||||
.expect("Failed to build re-attach path");
|
||||
let request = ReAttachRequest { node_id: conf.id };
|
||||
|
||||
// TODO: we should have been passed a cancellation token, and use it to end
|
||||
// this loop gracefully
|
||||
loop {
|
||||
let response = match client
|
||||
.post(re_attach_path.clone())
|
||||
.json(&request)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Err(e) => Err(anyhow::Error::from(e)),
|
||||
Ok(r) => {
|
||||
if r.status() == StatusCode::OK {
|
||||
r.json::<ReAttachResponse>()
|
||||
.await
|
||||
.map_err(|e| anyhow::Error::from(e))
|
||||
} else {
|
||||
Err(anyhow::anyhow!("Unexpected status {}", r.status()))
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
match response {
|
||||
Ok(res) => {
|
||||
tracing::info!(
|
||||
"Received re-attach response with {0} tenants",
|
||||
res.tenants.len()
|
||||
);
|
||||
|
||||
// TODO: do something with it
|
||||
break res
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|t| (t.id, t.generation))
|
||||
.collect::<HashMap<_, _>>();
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("Error re-attaching tenants, retrying: {e:#}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let tenant_generations = match tenant_generations {
|
||||
Some(g) => Some(g.await),
|
||||
None => {
|
||||
info!("Control plane API not configured, tenant generations are disabled");
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let mut dir_entries = fs::read_dir(&tenants_dir)
|
||||
.await
|
||||
.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
|
||||
@@ -199,53 +122,9 @@ pub async fn init_tenant_mgr(
|
||||
continue;
|
||||
}
|
||||
|
||||
let tenant_id = match tenant_dir_path
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TenantId>()
|
||||
{
|
||||
Ok(id) => id,
|
||||
Err(_) => {
|
||||
warn!(
|
||||
"Invalid tenant path (garbage in our repo directory?): {0}",
|
||||
tenant_dir_path.display()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let generation = if let Some(generations) = &tenant_generations {
|
||||
// We have a generation map: treat it as the authority for whether
|
||||
// this tenant is really attached.
|
||||
if let Some(gen) = generations.get(&HexTenantId::new(tenant_id)) {
|
||||
Generation::new(*gen)
|
||||
} else {
|
||||
info!("Detaching tenant {0}, control plane omitted it in re-attach response", tenant_id);
|
||||
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
|
||||
error!(
|
||||
"Failed to remove detached tenant directory '{}': {:?}",
|
||||
tenant_dir_path.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// Legacy mode: no generation information, any tenant present
|
||||
// on local disk may activate
|
||||
info!(
|
||||
"Starting tenant {0} in legacy mode, no generation",
|
||||
tenant_dir_path.display()
|
||||
);
|
||||
Generation::none()
|
||||
};
|
||||
|
||||
match schedule_local_tenant_processing(
|
||||
conf,
|
||||
tenant_id,
|
||||
&tenant_dir_path,
|
||||
generation,
|
||||
resources.clone(),
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
@@ -281,9 +160,7 @@ pub async fn init_tenant_mgr(
|
||||
|
||||
pub(crate) fn schedule_local_tenant_processing(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
tenant_path: &Path,
|
||||
generation: Generation,
|
||||
resources: TenantSharedResources,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
@@ -304,6 +181,15 @@ pub(crate) fn schedule_local_tenant_processing(
|
||||
"Cannot load tenant from empty directory {tenant_path:?}"
|
||||
);
|
||||
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TenantId>()
|
||||
.with_context(|| {
|
||||
format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}")
|
||||
})?;
|
||||
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
|
||||
anyhow::ensure!(
|
||||
!conf.tenant_ignore_mark_file_path(&tenant_id).exists(),
|
||||
@@ -316,11 +202,9 @@ pub(crate) fn schedule_local_tenant_processing(
|
||||
match Tenant::spawn_attach(
|
||||
conf,
|
||||
tenant_id,
|
||||
generation,
|
||||
resources.broker_client,
|
||||
tenants,
|
||||
remote_storage,
|
||||
resources.deletion_queue_client,
|
||||
ctx,
|
||||
) {
|
||||
Ok(tenant) => tenant,
|
||||
@@ -340,9 +224,7 @@ pub(crate) fn schedule_local_tenant_processing(
|
||||
} else {
|
||||
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
|
||||
// Start loading the tenant into memory. It will initially be in Loading state.
|
||||
Tenant::spawn_load(
|
||||
conf, tenant_id, generation, resources, init_order, tenants, ctx,
|
||||
)
|
||||
Tenant::spawn_load(conf, tenant_id, resources, init_order, tenants, ctx)
|
||||
};
|
||||
Ok(tenant)
|
||||
}
|
||||
@@ -465,10 +347,8 @@ pub async fn create_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue: &DeletionQueue,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Tenant>, TenantMapInsertError> {
|
||||
tenant_map_insert(tenant_id, || {
|
||||
@@ -482,11 +362,9 @@ pub async fn create_tenant(
|
||||
let tenant_resources = TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage,
|
||||
deletion_queue_client: deletion_queue.new_client(),
|
||||
};
|
||||
let created_tenant =
|
||||
schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
|
||||
generation, tenant_resources, None, &TENANTS, ctx)?;
|
||||
schedule_local_tenant_processing(conf, &tenant_directory, tenant_resources, None, &TENANTS, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
@@ -635,7 +513,6 @@ pub async fn load_tenant(
|
||||
tenant_id: TenantId,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue: &DeletionQueue,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
tenant_map_insert(tenant_id, || {
|
||||
@@ -649,11 +526,8 @@ pub async fn load_tenant(
|
||||
let resources = TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage,
|
||||
deletion_queue_client: deletion_queue.new_client(),
|
||||
};
|
||||
// TODO: remove the `/load` API once generation support is complete:
|
||||
// it becomes equivalent to attaching.
|
||||
let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, Generation::none(), resources, None, &TENANTS, ctx)
|
||||
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, resources, None, &TENANTS, ctx)
|
||||
.with_context(|| {
|
||||
format!("Failed to schedule tenant processing in path {tenant_path:?}")
|
||||
})?;
|
||||
@@ -717,11 +591,9 @@ pub async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapLis
|
||||
pub async fn attach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
tenant_conf: TenantConfOpt,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
deletion_queue: &DeletionQueue,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
tenant_map_insert(tenant_id, || {
|
||||
@@ -739,9 +611,8 @@ pub async fn attach_tenant(
|
||||
let resources = TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage: Some(remote_storage),
|
||||
deletion_queue_client: deletion_queue.new_client(),
|
||||
};
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, resources, None, &TENANTS, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
|
||||
@@ -56,11 +56,9 @@
|
||||
//! # Consistency
|
||||
//!
|
||||
//! To have a consistent remote structure, it's important that uploads and
//! deletions are performed in the right order. For example:
//! - the index file contains a list of layer files, so it must not be uploaded
//!   until all the layer files that are in its list have been successfully uploaded.
//! - objects must be removed from the index before being deleted, and that updated
//!   index must be written to remote storage before deleting the objects from remote storage.
//! deletions are performed in the right order. For example, the index file
//! contains a list of layer files, so it must not be uploaded until all the
//! layer files that are in its list have been successfully uploaded.
//!
//! The contract between client and its user is that the user is responsible for
//! scheduling operations in an order that keeps the remote consistent as
@@ -72,12 +70,10 @@
|
||||
//! correct order, and the client will parallelize the operations in a way that
|
||||
//! is safe.
|
||||
//!
|
||||
//! The caller should be careful with deletion, though:
|
||||
//! - they should not delete local files that have been scheduled for upload but
|
||||
//! not yet finished uploading. Otherwise the upload will fail. To wait for an
|
||||
//! upload to finish, use the 'wait_completion' function (more on that later.)
|
||||
//! - they should not do remote deletions via DeletionQueue without waiting for
|
||||
//! the latest metadata to upload via RemoteTimelineClient.
|
||||
//! The caller should be careful with deletion, though. They should not delete
|
||||
//! local files that have been scheduled for upload but not yet finished uploading.
|
||||
//! Otherwise the upload will fail. To wait for an upload to finish, use
|
||||
//! the 'wait_completion' function (more on that later.)
|
||||
//!
|
||||
//! All of this relies on the following invariants:
|
||||
//!
|
||||
@@ -139,7 +135,7 @@
|
||||
//! - Initiate upload queue with that [`IndexPart`].
|
||||
//! - Reschedule all lost operations by comparing the local filesystem state
|
||||
//! and remote state as per [`IndexPart`]. This is done in
|
||||
//! [`Tenant::timeline_init_and_sync`].
|
||||
//! [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
|
||||
//!
|
||||
//! Note that if we crash during file deletion between the index update
|
||||
//! that removes the file from the list of files, and deleting the remote file,
|
||||
@@ -176,6 +172,7 @@
|
||||
//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
|
||||
//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
|
||||
//!
|
||||
//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers.
|
||||
//! We keep track of the fact that a client is in `Attaching` state in a marker
|
||||
//! file on the local disk. This is critical because, when we restart the pageserver,
|
||||
//! we do not want to do the `List timelines` step for each tenant that has already
|
||||
@@ -195,31 +192,31 @@
|
||||
//! not created and the uploads are skipped.
|
||||
//! Theoretically, it should be ok to remove and re-add remote storage configuration to
|
||||
//! the pageserver config at any time, since it doesn't make a difference to
|
||||
//! [`Timeline::load_layer_map`].
|
||||
//! `reconcile_with_remote`.
|
||||
//! Of course, the remote timeline dir must not change while we have de-configured
|
||||
//! remote storage, i.e., the pageserver must remain the owner of the given prefix
|
||||
//! in remote storage.
|
||||
//! But note that we don't test any of this right now.
|
||||
//!
|
||||
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
|
||||
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
|
||||
//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote
|
||||
|
||||
mod delete;
|
||||
mod download;
|
||||
pub mod index;
|
||||
mod upload;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
// re-export these
|
||||
pub use download::{is_temp_download_file, list_remote_timelines};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::backoff::{
|
||||
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::path::Path;
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
@@ -229,15 +226,14 @@ use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::metrics::{
|
||||
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
|
||||
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
|
||||
REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
|
||||
};
|
||||
use crate::task_mgr::shutdown_token;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::upload_queue::Delete;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr,
|
||||
@@ -247,7 +243,6 @@ use crate::{
|
||||
tenant::upload_queue::{
|
||||
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
|
||||
},
|
||||
tenant::TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -256,7 +251,6 @@ use self::index::IndexPart;
|
||||
|
||||
use super::storage_layer::LayerFileName;
|
||||
use super::upload_queue::SetDeletedFlagProgress;
|
||||
use super::Generation;
|
||||
|
||||
// Occasional network issues and such can cause remote operations to fail, and
|
||||
// that's expected. If a download fails, we log it at info-level, and retry.
|
||||
@@ -320,7 +314,6 @@ pub struct RemoteTimelineClient {
|
||||
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
generation: Generation,
|
||||
|
||||
upload_queue: Mutex<UploadQueue>,
|
||||
|
||||
@@ -341,14 +334,12 @@ impl RemoteTimelineClient {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
generation: Generation,
|
||||
) -> RemoteTimelineClient {
|
||||
RemoteTimelineClient {
|
||||
conf,
|
||||
runtime: BACKGROUND_RUNTIME.handle().to_owned(),
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
generation,
|
||||
storage_impl: remote_storage,
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
|
||||
@@ -362,10 +353,6 @@ impl RemoteTimelineClient {
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
info!(
|
||||
"initialized upload queue from remote index with {} layer files",
|
||||
index_part.layer_metadata.len()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -378,7 +365,6 @@ impl RemoteTimelineClient {
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_empty_remote(local_metadata)?;
|
||||
self.update_remote_physical_size_gauge(None);
|
||||
info!("initialized upload queue as empty");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -461,7 +447,6 @@ impl RemoteTimelineClient {
|
||||
&self.storage_impl,
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
self.generation,
|
||||
)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
@@ -550,7 +535,8 @@ impl RemoteTimelineClient {
|
||||
// ahead of what's _actually_ on the remote during index upload.
|
||||
upload_queue.latest_metadata = metadata.clone();
|
||||
|
||||
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
|
||||
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
|
||||
self.schedule_index_upload(upload_queue, metadata_bytes);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -570,7 +556,8 @@ impl RemoteTimelineClient {
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
|
||||
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
|
||||
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
|
||||
self.schedule_index_upload(upload_queue, metadata_bytes);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -580,7 +567,7 @@ impl RemoteTimelineClient {
|
||||
fn schedule_index_upload(
|
||||
self: &Arc<Self>,
|
||||
upload_queue: &mut UploadQueueInitialized,
|
||||
metadata: TimelineMetadata,
|
||||
metadata_bytes: Vec<u8>,
|
||||
) {
|
||||
info!(
|
||||
"scheduling metadata upload with {} files ({} changed)",
|
||||
@@ -593,7 +580,7 @@ impl RemoteTimelineClient {
|
||||
let index_part = IndexPart::new(
|
||||
upload_queue.latest_files.clone(),
|
||||
disk_consistent_lsn,
|
||||
metadata,
|
||||
metadata_bytes,
|
||||
);
|
||||
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
@@ -640,66 +627,50 @@ impl RemoteTimelineClient {
|
||||
/// deletion won't actually be performed, until any previously scheduled
|
||||
/// upload operations, and the index file upload, have completed
|
||||
/// successfully.
|
||||
pub async fn schedule_layer_file_deletion(
|
||||
pub fn schedule_layer_file_deletion(
|
||||
self: &Arc<Self>,
|
||||
names: &[LayerFileName],
|
||||
deletion_queue_client: &DeletionQueueClient,
|
||||
) -> anyhow::Result<()> {
|
||||
// Synchronous update of upload queues under mutex
|
||||
let with_generations = {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
// Deleting layers doesn't affect the values stored in TimelineMetadata,
|
||||
// so we don't need update it. Just serialize it.
|
||||
let metadata = upload_queue.latest_metadata.clone();
|
||||
// Deleting layers doesn't affect the values stored in TimelineMetadata,
|
||||
// so we don't need update it. Just serialize it.
|
||||
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
|
||||
|
||||
// Decorate our list of names with each name's generation, dropping
|
||||
// names that are unexpectedly missing from our metadata.
|
||||
let with_generations: Vec<_> = names
|
||||
.into_iter()
|
||||
.filter_map(|name| {
|
||||
// Remove from latest_files, learning the file's remote generation in the process
|
||||
let meta = upload_queue.latest_files.remove(name);
|
||||
|
||||
if let Some(meta) = meta {
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
Some((name.clone(), meta.generation))
|
||||
} else {
|
||||
// This is unexpected: latest_files is meant to be kept up to
|
||||
// date. We can't delete the layer if we have forgotten what
|
||||
// generation it was in.
|
||||
warn!("Deleting layer {name} not found in latest_files list");
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
|
||||
self.schedule_index_upload(upload_queue, metadata);
|
||||
// Update the remote index file, removing the to-be-deleted files from the index,
|
||||
// before deleting the actual files.
|
||||
//
|
||||
// Once we start removing files from upload_queue.latest_files, there's
|
||||
// no going back! Otherwise, some of the files would already be removed
|
||||
// from latest_files, but not yet scheduled for deletion. Use a closure
|
||||
// to syntactically forbid ? or bail! calls here.
|
||||
let no_bail_here = || {
|
||||
for name in names {
|
||||
upload_queue.latest_files.remove(name);
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
}
|
||||
|
||||
with_generations
|
||||
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
|
||||
self.schedule_index_upload(upload_queue, metadata_bytes);
|
||||
}
|
||||
|
||||
// schedule the actual deletions
|
||||
for name in names {
|
||||
let op = UploadOp::Delete(Delete {
|
||||
file_kind: RemoteOpFileKind::Layer,
|
||||
layer_file_name: name.clone(),
|
||||
scheduled_from_timeline_delete: false,
|
||||
});
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
info!("scheduled layer file deletion {name}");
|
||||
}
|
||||
|
||||
// Launch the tasks immediately, if possible
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
};
|
||||
|
||||
// Barrier: we must ensure all prior uploads and index writes have landed in S3
|
||||
// before emitting deletions.
|
||||
if let Err(e) = self.wait_completion().await {
|
||||
// This can only fail if upload queue is shut down: if this happens, we do
|
||||
// not emit any deletions. In this condition (remote client is shut down
|
||||
// during compaction or GC) we may leak some objects.
|
||||
bail!("Cannot complete layer file deletions during shutdown ({e})");
|
||||
}
|
||||
|
||||
// Enqueue deletions
|
||||
deletion_queue_client
|
||||
.push_layers(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.generation,
|
||||
with_generations,
|
||||
)
|
||||
.await?;
|
||||
no_bail_here();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
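The new deletion flow relies on UploadOp::Barrier plus wait_completion: deletions are only pushed to the DeletionQueue once a barrier confirms that every previously scheduled upload, including the index, has landed. A minimal sketch of such a barrier built on a tokio watch channel (simplified; not the real upload-queue types):

use tokio::sync::watch;

// A barrier the queue completes once every operation scheduled before it has finished.
struct Barrier {
    sender: watch::Sender<()>,
}

impl Barrier {
    fn new() -> (Barrier, watch::Receiver<()>) {
        let (sender, receiver) = watch::channel(());
        (Barrier { sender }, receiver)
    }

    // Called by the queue when all earlier operations are done.
    fn complete(&self) {
        self.sender.send_replace(());
    }
}

// A waiter blocks here until complete() has been called.
async fn wait_for_barrier(mut receiver: watch::Receiver<()>) {
    let _ = receiver.changed().await;
}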
@@ -783,14 +754,15 @@ impl RemoteTimelineClient {
|
||||
pausable_failpoint!("persist_deleted_index_part");
|
||||
|
||||
backoff::retry(
|
||||
|| {
|
||||
|| async {
|
||||
upload::upload_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
self.generation,
|
||||
&index_part_with_deleted_at,
|
||||
)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
1,
|
||||
@@ -799,8 +771,6 @@ impl RemoteTimelineClient {
|
||||
// when executed as part of tenant deletion this happens in the background
|
||||
2,
|
||||
"persist_index_part_with_deleted_flag",
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -825,13 +795,12 @@ impl RemoteTimelineClient {
|
||||
/// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfully set.
|
||||
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something,
/// deletes leaked files if any, and proceeds with deletion of the index file at the end.
|
||||
pub(crate) async fn delete_all(
|
||||
self: &Arc<Self>,
|
||||
deletion_queue: &DeletionQueueClient,
|
||||
) -> anyhow::Result<()> {
|
||||
pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let layers: Vec<_> = {
|
||||
let (mut receiver, deletions_queued) = {
|
||||
let mut deletions_queued = 0;
|
||||
|
||||
let mut locked = self.upload_queue.lock().unwrap();
|
||||
let stopped = locked.stopped_mut()?;
|
||||
|
||||
@@ -843,29 +812,40 @@ impl RemoteTimelineClient {
|
||||
|
||||
stopped
|
||||
.upload_queue_for_deletion
|
||||
.latest_files
|
||||
.drain()
|
||||
.map(|kv| (kv.0, kv.1.generation))
|
||||
.collect()
|
||||
.queued_operations
|
||||
.reserve(stopped.upload_queue_for_deletion.latest_files.len());
|
||||
|
||||
// schedule the actual deletions
|
||||
for name in stopped.upload_queue_for_deletion.latest_files.keys() {
|
||||
let op = UploadOp::Delete(Delete {
|
||||
file_kind: RemoteOpFileKind::Layer,
|
||||
layer_file_name: name.clone(),
|
||||
scheduled_from_timeline_delete: true,
|
||||
});
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
stopped
|
||||
.upload_queue_for_deletion
|
||||
.queued_operations
|
||||
.push_back(op);
|
||||
|
||||
info!("scheduled layer file deletion {name}");
|
||||
deletions_queued += 1;
|
||||
}
|
||||
|
||||
self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
|
||||
|
||||
(
|
||||
self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
|
||||
deletions_queued,
|
||||
)
|
||||
};
|
||||
|
||||
let layer_deletion_count = layers.len();
|
||||
|
||||
let layer_paths = layers
|
||||
.into_iter()
|
||||
.map(|(layer, generation)| {
|
||||
remote_layer_path(&self.tenant_id, &self.timeline_id, &layer, generation)
|
||||
})
|
||||
.collect();
|
||||
deletion_queue.push_immediate(layer_paths).await?;
|
||||
receiver.changed().await.context("upload queue shut down")?;
|
||||
|
||||
// Do not delete index part yet, it is needed for possible retry. If we remove it first
|
||||
// and the retry arrives at a different pageserver, there won't be any traces of it in remote storage
|
||||
let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
|
||||
|
||||
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
|
||||
// taking the burden of listing all the layers that we already know we should delete.
|
||||
deletion_queue.flush_immediate().await?;
|
||||
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
|
||||
let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
|
||||
|
||||
let remaining = backoff::retry(
|
||||
|| async {
|
||||
@@ -877,7 +857,6 @@ impl RemoteTimelineClient {
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"list_prefixes",
|
||||
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
|
||||
)
|
||||
.await
|
||||
.context("list prefixes")?;
|
||||
@@ -894,9 +873,16 @@ impl RemoteTimelineClient {
|
||||
})
|
||||
.collect();
|
||||
|
||||
let not_referenced_count = remaining.len();
|
||||
if !remaining.is_empty() {
|
||||
deletion_queue.push_immediate(remaining).await?;
|
||||
backoff::retry(
|
||||
|| async { self.storage_impl.delete_objects(&remaining).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"delete_objects",
|
||||
)
|
||||
.await
|
||||
.context("delete_objects")?;
|
||||
}
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-delete", |_| {
|
||||
@@ -907,14 +893,17 @@ impl RemoteTimelineClient {
|
||||
|
||||
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
|
||||
|
||||
debug!("enqueuing index part deletion");
|
||||
deletion_queue
|
||||
.push_immediate([index_file_path].to_vec())
|
||||
.await?;
|
||||
debug!("deleting index part");
|
||||
|
||||
// Timeline deletion is rare and we have probably emitted a reasonable number of objects: wait
|
||||
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
|
||||
deletion_queue.flush_immediate().await?;
|
||||
backoff::retry(
|
||||
|| async { self.storage_impl.delete(&index_file_path).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"delete_index",
|
||||
)
|
||||
.await
|
||||
.context("delete_index")?;
|
||||
|
||||
fail::fail_point!("timeline-delete-after-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
@@ -922,7 +911,7 @@ impl RemoteTimelineClient {
|
||||
))?
|
||||
});
|
||||
|
||||
info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");
|
||||
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -945,6 +934,10 @@ impl RemoteTimelineClient {
|
||||
// have finished.
|
||||
upload_queue.inprogress_tasks.is_empty()
|
||||
}
|
||||
UploadOp::Delete(_) => {
|
||||
// Wait for preceding uploads to finish. Concurrent deletions are OK, though.
|
||||
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
|
||||
}
|
||||
|
||||
UploadOp::Barrier(_) => upload_queue.inprogress_tasks.is_empty(),
|
||||
};
|
||||
@@ -972,6 +965,9 @@ impl RemoteTimelineClient {
|
||||
UploadOp::UploadMetadata(_, _) => {
|
||||
upload_queue.num_inprogress_metadata_uploads += 1;
|
||||
}
|
||||
UploadOp::Delete(_) => {
|
||||
upload_queue.num_inprogress_deletions += 1;
|
||||
}
|
||||
UploadOp::Barrier(sender) => {
|
||||
sender.send_replace(());
|
||||
continue;
|
||||
@@ -1050,17 +1046,15 @@ impl RemoteTimelineClient {
|
||||
|
||||
let upload_result: anyhow::Result<()> = match &task.op {
|
||||
UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
|
||||
let path = self
|
||||
let path = &self
|
||||
.conf
|
||||
.timeline_path(&self.tenant_id, &self.timeline_id)
|
||||
.join(layer_file_name.file_name());
|
||||
|
||||
upload::upload_timeline_layer(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
&path,
|
||||
path,
|
||||
layer_metadata,
|
||||
self.generation,
|
||||
)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
@@ -1072,20 +1066,11 @@ impl RemoteTimelineClient {
|
||||
.await
|
||||
}
|
||||
UploadOp::UploadMetadata(ref index_part, _lsn) => {
|
||||
let mention_having_future_layers = if cfg!(feature = "testing") {
|
||||
index_part
|
||||
.layer_metadata
|
||||
.keys()
|
||||
.any(|x| x.is_in_future(*_lsn))
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
let res = upload::upload_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
self.generation,
|
||||
index_part,
|
||||
)
|
||||
.measure_remote_op(
|
||||
@@ -1098,13 +1083,24 @@ impl RemoteTimelineClient {
|
||||
.await;
|
||||
if res.is_ok() {
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
if mention_having_future_layers {
|
||||
// find rationale near crate::tenant::timeline::init::cleanup_future_layer
|
||||
tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup");
|
||||
}
|
||||
}
|
||||
res
|
||||
}
|
||||
UploadOp::Delete(delete) => {
|
||||
let path = &self
|
||||
.conf
|
||||
.timeline_path(&self.tenant_id, &self.timeline_id)
|
||||
.join(delete.layer_file_name.file_name());
|
||||
delete::delete_layer(self.conf, &self.storage_impl, path)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
delete.file_kind,
|
||||
RemoteOpKind::Delete,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await
|
||||
}
|
||||
UploadOp::Barrier(_) => {
|
||||
// unreachable. Barrier operations are handled synchronously in
|
||||
// launch_queued_tasks
|
||||
@@ -1138,13 +1134,14 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
|
||||
// sleep until it's time to retry, or we're cancelled
|
||||
exponential_backoff(
|
||||
retries,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
&shutdown_token(),
|
||||
)
|
||||
.await;
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => { },
|
||||
_ = exponential_backoff(
|
||||
retries,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
) => { },
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1164,7 +1161,15 @@ impl RemoteTimelineClient {
|
||||
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = match upload_queue_guard.deref_mut() {
|
||||
UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
|
||||
UploadQueue::Stopped(_) => { None }
|
||||
UploadQueue::Stopped(stopped) => {
|
||||
// Special care is needed for deletions: if it was an earlier deletion (not scheduled from timeline deletion),
|
||||
// then stop() took care of it so we just return.
|
||||
// For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
|
||||
match &task.op {
|
||||
UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
|
||||
_ => None
|
||||
}
|
||||
},
|
||||
UploadQueue::Initialized(qi) => { Some(qi) }
|
||||
};
|
||||
|
||||
@@ -1186,6 +1191,9 @@ impl RemoteTimelineClient {
|
||||
upload_queue.num_inprogress_metadata_uploads -= 1;
|
||||
upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
|
||||
}
|
||||
UploadOp::Delete(_) => {
|
||||
upload_queue.num_inprogress_deletions -= 1;
|
||||
}
|
||||
UploadOp::Barrier(_) => unreachable!(),
|
||||
};
|
||||
|
||||
@@ -1217,6 +1225,13 @@ impl RemoteTimelineClient {
|
||||
reason: "metadata uploads are tiny",
|
||||
},
|
||||
),
|
||||
UploadOp::Delete(delete) => (
|
||||
delete.file_kind,
|
||||
RemoteOpKind::Delete,
|
||||
DontTrackSize {
|
||||
reason: "should we track deletes? positive or negative sign?",
|
||||
},
|
||||
),
|
||||
UploadOp::Barrier(_) => {
|
||||
// we do not account these
|
||||
return None;
|
||||
@@ -1276,6 +1291,7 @@ impl RemoteTimelineClient {
|
||||
last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::default(),
|
||||
queued_operations: VecDeque::default(),
|
||||
};
|
||||
@@ -1296,7 +1312,9 @@ impl RemoteTimelineClient {
|
||||
|
||||
// consistency check
|
||||
assert_eq!(
|
||||
qi.num_inprogress_layer_uploads + qi.num_inprogress_metadata_uploads,
|
||||
qi.num_inprogress_layer_uploads
|
||||
+ qi.num_inprogress_metadata_uploads
|
||||
+ qi.num_inprogress_deletions,
|
||||
qi.inprogress_tasks.len()
|
||||
);
|
||||
|
||||
@@ -1321,84 +1339,14 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
|
||||
let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}");
|
||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||
}
|
||||
|
||||
pub fn remote_timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
|
||||
remote_timelines_path(tenant_id).join(&PathBuf::from(timeline_id.to_string()))
|
||||
}
|
||||
|
||||
pub fn remote_layer_path(
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
layer_file_name: &LayerFileName,
|
||||
generation: Generation,
|
||||
) -> RemotePath {
|
||||
// Generation-aware key format
|
||||
let path = format!(
|
||||
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
|
||||
layer_file_name.file_name(),
|
||||
generation.get_suffix()
|
||||
);
|
||||
|
||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||
}
|
||||
|
||||
pub fn remote_index_path(
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
generation: Generation,
|
||||
) -> RemotePath {
|
||||
RemotePath::from_string(&format!(
|
||||
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
|
||||
IndexPart::FILE_NAME,
|
||||
generation.get_suffix()
|
||||
))
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
/// Files in remote storage are stored with paths relative to the workdir.
/// That path includes both the tenant and timeline ids, which makes the remote storage path unique.
|
||||
///
|
||||
/// Errors if the path provided does not start from pageserver's workdir.
|
||||
pub fn remote_path(
|
||||
conf: &PageServerConf,
|
||||
local_path: &Path,
|
||||
generation: Option<Generation>,
|
||||
) -> anyhow::Result<RemotePath> {
|
||||
let stripped = local_path
|
||||
.strip_prefix(&conf.workdir)
|
||||
.context("Failed to strip workdir prefix")?;
|
||||
|
||||
let suffixed = if let Some(generation) = generation {
|
||||
format!(
|
||||
"{0}{1}",
|
||||
stripped.to_string_lossy(),
|
||||
generation.get_suffix()
|
||||
)
|
||||
} else {
|
||||
stripped.to_string_lossy().to_string()
|
||||
};
|
||||
|
||||
RemotePath::new(&PathBuf::from(suffixed)).with_context(|| {
|
||||
format!(
|
||||
"Failed to resolve remote part of path {:?} for base {:?}",
|
||||
local_path, conf.workdir
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
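remote_layer_path and remote_index_path append the generation suffix to the object name, so objects written by different pageserver generations never overwrite each other in the bucket. A toy version of that key construction with plain strings (the suffix format here is only an assumption for illustration; the real one comes from Generation::get_suffix):

// Toy version of the generation-aware key layout:
// "tenants/<tenant>/timelines/<timeline>/<file><generation suffix>".
fn layer_key(tenant_id: &str, timeline_id: &str, file_name: &str, generation: Option<u32>) -> String {
    let suffix = match generation {
        // Assumed suffix shape, for illustration only.
        Some(g) => format!("-{g:08x}"),
        // Legacy objects written before generations have no suffix.
        None => String::new(),
    };
    format!("tenants/{tenant_id}/timelines/{timeline_id}/{file_name}{suffix}")
}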
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
deletion_queue::mock::MockDeletionQueue,
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
Generation, Tenant, Timeline,
|
||||
Tenant, Timeline,
|
||||
},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
@@ -1440,11 +1388,8 @@ mod tests {
|
||||
assert_eq!(avec, bvec);
|
||||
}
|
||||
|
||||
fn assert_remote_files(expected: &[&str], remote_path: &Path, generation: Generation) {
|
||||
let mut expected: Vec<String> = expected
|
||||
.iter()
|
||||
.map(|x| format!("{}{}", x, generation.get_suffix()))
|
||||
.collect();
|
||||
fn assert_remote_files(expected: &[&str], remote_path: &Path) {
|
||||
let mut expected: Vec<String> = expected.iter().map(|x| String::from(*x)).collect();
|
||||
expected.sort();
|
||||
|
||||
let mut found: Vec<String> = Vec::new();
|
||||
@@ -1465,7 +1410,6 @@ mod tests {
|
||||
tenant_ctx: RequestContext,
|
||||
remote_fs_dir: PathBuf,
|
||||
client: Arc<RemoteTimelineClient>,
|
||||
deletion_queue: MockDeletionQueue,
|
||||
}
|
||||
|
||||
impl TestSetup {
|
||||
@@ -1496,8 +1440,6 @@ mod tests {
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
};
|
||||
|
||||
let generation = Generation::new(0xdeadbeef);
|
||||
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
||||
|
||||
let client = Arc::new(RemoteTimelineClient {
|
||||
@@ -1505,8 +1447,7 @@ mod tests {
|
||||
runtime: tokio::runtime::Handle::current(),
|
||||
tenant_id: harness.tenant_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
generation,
|
||||
storage_impl: storage.clone(),
|
||||
storage_impl: storage,
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(
|
||||
&harness.tenant_id,
|
||||
@@ -1514,8 +1455,6 @@ mod tests {
|
||||
)),
|
||||
});
|
||||
|
||||
let deletion_queue = MockDeletionQueue::new(Some(storage));
|
||||
|
||||
Ok(Self {
|
||||
harness,
|
||||
tenant,
|
||||
@@ -1523,7 +1462,6 @@ mod tests {
|
||||
tenant_ctx: ctx,
|
||||
remote_fs_dir,
|
||||
client,
|
||||
deletion_queue,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1552,7 +1490,6 @@ mod tests {
|
||||
tenant_ctx: _tenant_ctx,
|
||||
remote_fs_dir,
|
||||
client,
|
||||
deletion_queue,
|
||||
} = TestSetup::new("upload_scheduling").await.unwrap();
|
||||
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
@@ -1568,8 +1505,6 @@ mod tests {
|
||||
.init_upload_queue_for_empty_remote(&metadata)
|
||||
.unwrap();
|
||||
|
||||
let generation = Generation::new(0xdeadbeef);
|
||||
|
||||
// Create a couple of dummy files, schedule upload for them
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
|
||||
@@ -1589,13 +1524,13 @@ mod tests {
|
||||
client
|
||||
.schedule_layer_file_upload(
|
||||
&layer_file_name_1,
|
||||
&LayerFileMetadata::new(content_1.len() as u64, generation),
|
||||
&LayerFileMetadata::new(content_1.len() as u64),
|
||||
)
|
||||
.unwrap();
|
||||
client
|
||||
.schedule_layer_file_upload(
|
||||
&layer_file_name_2,
|
||||
&LayerFileMetadata::new(content_2.len() as u64, generation),
|
||||
&LayerFileMetadata::new(content_2.len() as u64),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
@@ -1653,74 +1588,41 @@ mod tests {
|
||||
&layer_file_name_2.file_name(),
|
||||
],
|
||||
);
|
||||
assert_eq!(index_part.metadata, metadata);
|
||||
let downloaded_metadata = index_part.parse_metadata().unwrap();
|
||||
assert_eq!(downloaded_metadata, metadata);
|
||||
|
||||
// Schedule upload and then a deletion. Check that the deletion is queued
|
||||
client
|
||||
.schedule_layer_file_upload(
|
||||
&layer_file_name_3,
|
||||
&LayerFileMetadata::new(content_3.len() as u64, generation),
|
||||
&LayerFileMetadata::new(content_3.len() as u64),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
assert_eq!(upload_queue.queued_operations.len(), 0);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
|
||||
}
|
||||
|
||||
assert_remote_files(
|
||||
&[
|
||||
&layer_file_name_1.file_name(),
|
||||
&layer_file_name_2.file_name(),
|
||||
"index_part.json",
|
||||
],
|
||||
&remote_timeline_dir,
|
||||
generation,
|
||||
);
|
||||
|
||||
client
|
||||
.schedule_layer_file_deletion(
|
||||
&[layer_file_name_1.clone()],
|
||||
&deletion_queue.new_client(),
|
||||
)
|
||||
.await
|
||||
.schedule_layer_file_deletion(&[layer_file_name_1.clone()])
|
||||
.unwrap();
|
||||
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
|
||||
// Deletion schedules upload of the index file via RemoteTimelineClient, and
|
||||
// deletion of layer files via DeletionQueue. The uploads have all been flushed
|
||||
// because schedule_layer_file_deletion does a wait_completion before pushing
|
||||
// to the deletion_queue.
|
||||
assert_eq!(upload_queue.queued_operations.len(), 0);
|
||||
assert_eq!(upload_queue.inprogress_tasks.len(), 0);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads, 0);
|
||||
assert_eq!(
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
|
||||
0
|
||||
);
|
||||
// Deletion schedules upload of the index file, and the file deletion itself
|
||||
assert!(upload_queue.queued_operations.len() == 2);
|
||||
assert!(upload_queue.inprogress_tasks.len() == 1);
|
||||
assert!(upload_queue.num_inprogress_layer_uploads == 1);
|
||||
assert!(upload_queue.num_inprogress_deletions == 0);
|
||||
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
|
||||
}
|
||||
assert_remote_files(
|
||||
&[
|
||||
&layer_file_name_1.file_name(),
|
||||
&layer_file_name_2.file_name(),
|
||||
&layer_file_name_3.file_name(),
|
||||
"index_part.json",
|
||||
],
|
||||
&remote_timeline_dir,
|
||||
generation,
|
||||
);
|
||||
|
||||
// Finish uploads and deletions
|
||||
// Finish them
|
||||
client.wait_completion().await.unwrap();
|
||||
deletion_queue.pump().await;
|
||||
|
||||
// 1 layer was deleted
|
||||
assert_eq!(deletion_queue.get_executed(), 1);
|
||||
|
||||
assert_remote_files(
|
||||
&[
|
||||
@@ -1729,7 +1631,6 @@ mod tests {
|
||||
"index_part.json",
|
||||
],
|
||||
&remote_timeline_dir,
|
||||
generation,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1782,14 +1683,12 @@ mod tests {
|
||||
|
||||
// Test
|
||||
|
||||
let generation = Generation::new(0xdeadbeef);
|
||||
|
||||
let init = get_bytes_started_stopped();
|
||||
|
||||
client
|
||||
.schedule_layer_file_upload(
|
||||
&layer_file_name_1,
|
||||
&LayerFileMetadata::new(content_1.len() as u64, generation),
|
||||
&LayerFileMetadata::new(content_1.len() as u64),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
@@ -1824,23 +1723,4 @@ mod tests {
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// #[tokio::test]
|
||||
// async fn index_part_download() {
|
||||
// let TestSetup {
|
||||
// harness,
|
||||
// tenant: _tenant,
|
||||
// timeline: _timeline,
|
||||
// client,
|
||||
// ..
|
||||
// } = TestSetup::new("index_part_download").await.unwrap();
|
||||
|
||||
// let example_index_part = IndexPart {
|
||||
// version: 3,
|
||||
// timeline_layers: HashSet::new(),
|
||||
// layer_metadata:
|
||||
|
||||
// }
|
||||
|
||||
// }
|
||||
}
|
||||
|
||||
29
pageserver/src/tenant/remote_timeline_client/delete.rs
Normal file
@@ -0,0 +1,29 @@
|
||||
//! Helper functions to delete files from remote storage using a GenericRemoteStorage
|
||||
use anyhow::Context;
|
||||
use std::path::Path;
|
||||
use tracing::debug;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
|
||||
pub(super) async fn delete_layer<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
local_layer_path: &'a Path,
|
||||
) -> anyhow::Result<()> {
|
||||
fail::fail_point!("before-delete-layer", |_| {
|
||||
anyhow::bail!("failpoint before-delete-layer")
|
||||
});
|
||||
debug!("Deleting layer from remote storage: {local_layer_path:?}",);
|
||||
|
||||
let path_to_delete = conf.remote_path(local_layer_path)?;
|
||||
|
||||
// We don't want to print an error if the delete fails because the file has
|
||||
// already been deleted. Thankfully, in this situation S3 already
|
||||
// does not yield an error. While OS-provided local file system APIs do yield
|
||||
// errors, we avoid them in the `LocalFs` wrapper.
|
||||
storage.delete(&path_to_delete).await.with_context(|| {
|
||||
format!("Failed to delete remote layer from storage at {path_to_delete:?}")
|
||||
})
|
||||
}
|
||||
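// Local-filesystem analogue of the behaviour described in the comment above: a
// delete that treats "file not found" as success, so deleting an already-deleted
// object never surfaces an error. Sketch only; the real code relies on S3 and the
// LocalFs wrapper for this property.
use std::io;
use std::path::Path;

fn delete_idempotent(path: &Path) -> io::Result<()> {
    match std::fs::remove_file(path) {
        Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()),
        other => other,
    }
}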
@@ -11,20 +11,17 @@ use std::time::Duration;
|
||||
use anyhow::{anyhow, Context};
|
||||
use tokio::fs;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{backoff, crashsafe};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::Generation;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::{remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
|
||||
use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
|
||||
|
||||
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
|
||||
|
||||
@@ -43,16 +40,13 @@ pub async fn download_layer_file<'a>(
|
||||
) -> Result<u64, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let local_path = conf
|
||||
.timeline_path(&tenant_id, &timeline_id)
|
||||
.join(layer_file_name.file_name());
|
||||
let timeline_path = conf.timeline_path(&tenant_id, &timeline_id);
|
||||
|
||||
let remote_path = remote_layer_path(
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
layer_file_name,
|
||||
layer_metadata.generation,
|
||||
);
|
||||
let local_path = timeline_path.join(layer_file_name.file_name());
|
||||
|
||||
let remote_path = conf
|
||||
.remote_path(&local_path)
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// Perform a rename inspired by durable_rename from file_utils.c.
|
||||
// The sequence:
|
||||
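// A minimal sketch of a durable rename of the kind referenced above (write to a
// temporary file, fsync it, rename into place, fsync the parent directory so the
// rename itself is persisted). Unix-oriented and illustrative only; the exact
// sequence used by the real code is not reproduced here.
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::Path;

fn durable_install(tmp: &Path, dst: &Path, bytes: &[u8]) -> std::io::Result<()> {
    let mut f = OpenOptions::new().create(true).write(true).truncate(true).open(tmp)?;
    f.write_all(bytes)?;
    f.sync_all()?; // flush file contents to disk
    std::fs::rename(tmp, dst)?; // atomic replace within one filesystem
    if let Some(dir) = dst.parent() {
        File::open(dir)?.sync_all()?; // persist the directory entry for the rename
    }
    Ok(())
}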
@@ -178,19 +172,21 @@ pub fn is_temp_download_file(path: &Path) -> bool {
|
||||
}
|
||||
|
||||
/// List timelines of given tenant in remote storage
|
||||
pub async fn list_remote_timelines(
|
||||
storage: &GenericRemoteStorage,
|
||||
pub async fn list_remote_timelines<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<HashSet<TimelineId>> {
|
||||
let remote_path = remote_timelines_path(&tenant_id);
|
||||
let tenant_path = conf.timelines_path(&tenant_id);
|
||||
let tenant_storage_path = conf.remote_path(&tenant_path)?;
|
||||
|
||||
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
|
||||
anyhow::bail!("storage-sync-list-remote-timelines");
|
||||
});
|
||||
|
||||
let timelines = download_retry(
|
||||
|| storage.list_prefixes(Some(&remote_path)),
|
||||
&format!("list prefixes for {tenant_id}"),
|
||||
|| storage.list_prefixes(Some(&tenant_storage_path)),
|
||||
&format!("list prefixes for {tenant_path:?}"),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -224,140 +220,46 @@ pub async fn list_remote_timelines(
|
||||
Ok(timeline_ids)
|
||||
}
|
||||
|
||||
async fn do_download_index_part(
|
||||
local_path: &Path,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
index_generation: Generation,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
|
||||
|
||||
let index_part_bytes = download_retry(
|
||||
|| storage.download_all(&remote_path),
|
||||
&format!("download {remote_path:?}"),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
|
||||
.with_context(|| format!("Failed to deserialize index part file into file {local_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(index_part)
|
||||
}
|
||||
|
||||
pub(super) async fn download_index_part(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
my_generation: Generation,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
let local_path = conf
|
||||
let index_part_path = conf
|
||||
.metadata_path(tenant_id, timeline_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let part_storage_path = conf
|
||||
.remote_path(&index_part_path)
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
|
||||
if my_generation.is_none() {
|
||||
// Operating without generations: just fetch the generation-less path
|
||||
return do_download_index_part(&local_path, storage, tenant_id, timeline_id, my_generation)
|
||||
.await;
|
||||
}
|
||||
let index_part_bytes = download_retry(
|
||||
|| async {
|
||||
let mut index_part_download = storage.download(&part_storage_path).await?;
|
||||
|
||||
let previous_gen = my_generation.previous();
|
||||
let r_previous =
|
||||
do_download_index_part(&local_path, storage, tenant_id, timeline_id, previous_gen).await;
|
||||
|
||||
match r_previous {
|
||||
Ok(index_part) => {
|
||||
tracing::debug!("Found index_part from previous generation {previous_gen}");
|
||||
return Ok(index_part);
|
||||
}
|
||||
Err(e) => {
|
||||
if matches!(e, DownloadError::NotFound) {
|
||||
tracing::debug!("No index_part found from previous generation {previous_gen}, falling back to listing");
|
||||
} else {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/// Given the key of an index, parse out the generation part of the name
|
||||
fn parse_generation(path: RemotePath) -> Option<Generation> {
|
||||
let path = path.take();
|
||||
let file_name = match path.file_name() {
|
||||
Some(f) => f,
|
||||
None => {
|
||||
// Unexpected: we should be seeing index_part.json paths only
|
||||
tracing::warn!("Malformed index key {0}", path.display());
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let file_name_str = match file_name.to_str() {
|
||||
Some(s) => s,
|
||||
None => {
|
||||
tracing::warn!("Malformed index key {0}", path.display());
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
match file_name_str.split_once("-") {
|
||||
Some((_, gen_suffix)) => u32::from_str_radix(gen_suffix, 16)
|
||||
.map(|g| Generation::new(g))
|
||||
.ok(),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: we did not find an index_part.json from the previous generation, so
|
||||
// we will list all the index_part objects and pick the most recent.
|
||||
let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none());
|
||||
let indices = backoff::retry(
|
||||
|| async { storage.list_files(Some(&index_prefix)).await },
|
||||
|_| false,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"listing index_part files",
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || -> anyhow::Error {
|
||||
unreachable!()
|
||||
}),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(e))?;
|
||||
|
||||
let mut generations: Vec<_> = indices
|
||||
.into_iter()
|
||||
.filter_map(|k| parse_generation(k))
|
||||
.filter(|g| g <= &my_generation)
|
||||
.collect();
|
||||
|
||||
generations.sort();
|
||||
match generations.last() {
|
||||
Some(g) => {
|
||||
tracing::debug!("Found index_part in generation {g} (my generation {my_generation})");
|
||||
do_download_index_part(&local_path, storage, tenant_id, timeline_id, *g).await
|
||||
}
|
||||
None => {
|
||||
// This is not an error: the timeline may be newly created, or we may be
|
||||
// upgrading and have no historical index_part with a generation suffix.
|
||||
// Fall back to trying to load the un-suffixed index_part.json.
|
||||
tracing::info!(
|
||||
"No index_part.json-* found when loading {}/{} in generation {}",
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
my_generation
|
||||
);
|
||||
return do_download_index_part(
|
||||
&local_path,
|
||||
storage,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
Generation::none(),
|
||||
let mut index_part_bytes = Vec::new();
|
||||
tokio::io::copy(
|
||||
&mut index_part_download.download_stream,
|
||||
&mut index_part_bytes,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to download an index part into file {index_part_path:?}")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
Ok(index_part_bytes)
|
||||
},
|
||||
&format!("download {part_storage_path:?}"),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
|
||||
.with_context(|| {
|
||||
format!("Failed to deserialize index part file into file {index_part_path:?}")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(index_part)
|
||||
}
|
||||
|
||||
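// Sketch of the fallback selection described above, with plain u32s standing in
// for the crate's Generation type: among the listed index_part generations, keep
// only those not newer than our own generation and pick the most recent.
fn pick_index_generation(mut candidates: Vec<u32>, my_generation: u32) -> Option<u32> {
    candidates.retain(|g| *g <= my_generation);
    candidates.sort();
    candidates.last().copied()
}

fn main() {
    assert_eq!(pick_index_generation(vec![1, 4, 2], 3), Some(2));
    assert_eq!(pick_index_generation(vec![5, 6], 3), None);
}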
/// Helper function to handle retries for a download operation.
|
||||
@@ -378,10 +280,6 @@ where
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
description,
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || -> DownloadError {
|
||||
unreachable!()
|
||||
}),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
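// A stand-alone sketch of the retry pattern the helper above wraps: retry an async
// operation with capped exponential backoff and warn once the attempt count passes
// a threshold. Uses plain tokio; this is not the signature of the real helper.
use std::time::Duration;

async fn retry_async<T, E, Fut>(
    mut op: impl FnMut() -> Fut,
    warn_threshold: u32,
    max_retries: u32,
) -> Result<T, E>
where
    Fut: std::future::Future<Output = Result<T, E>>,
    E: std::fmt::Debug,
{
    let mut attempt = 0;
    loop {
        match op().await {
            Ok(v) => return Ok(v),
            Err(e) if attempt < max_retries => {
                if attempt >= warn_threshold {
                    eprintln!("operation failed on attempt {attempt}: {e:?}");
                }
                attempt += 1;
                // back off, capped at roughly 6.4 seconds
                tokio::time::sleep(Duration::from_millis(100 << attempt.min(6))).await;
            }
            Err(e) => return Err(e),
        }
    }
}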
@@ -12,7 +12,6 @@ use utils::bin_ser::SerializeError;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::upload_queue::UploadQueueInitialized;
|
||||
use crate::tenant::Generation;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -21,28 +20,22 @@ use utils::lsn::Lsn;
|
||||
/// Fields have to be `Option`s because a remote [`IndexPart`] can be from a different version, which
|
||||
/// might have less or more metadata depending on whether we are upgrading or rolling back an upgrade.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
||||
//#[cfg_attr(test, derive(Default))]
|
||||
#[cfg_attr(test, derive(Default))]
|
||||
pub struct LayerFileMetadata {
|
||||
file_size: u64,
|
||||
|
||||
pub(crate) generation: Generation,
|
||||
}
|
||||
|
||||
impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
|
||||
fn from(other: &IndexLayerMetadata) -> Self {
|
||||
LayerFileMetadata {
|
||||
file_size: other.file_size,
|
||||
generation: other.generation,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerFileMetadata {
|
||||
pub fn new(file_size: u64, generation: Generation) -> Self {
|
||||
LayerFileMetadata {
|
||||
file_size,
|
||||
generation,
|
||||
}
|
||||
pub fn new(file_size: u64) -> Self {
|
||||
LayerFileMetadata { file_size }
|
||||
}
|
||||
|
||||
pub fn file_size(&self) -> u64 {
|
||||
@@ -84,9 +77,7 @@ pub struct IndexPart {
|
||||
// private because internally we would read from metadata instead.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
disk_consistent_lsn: Lsn,
|
||||
|
||||
#[serde(rename = "metadata_bytes")]
|
||||
pub metadata: TimelineMetadata,
|
||||
metadata_bytes: Vec<u8>,
|
||||
}
|
||||
|
||||
impl IndexPart {
|
||||
@@ -104,7 +95,7 @@ impl IndexPart {
|
||||
pub fn new(
|
||||
layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata: TimelineMetadata,
|
||||
metadata_bytes: Vec<u8>,
|
||||
) -> Self {
|
||||
let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
|
||||
let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
|
||||
@@ -120,10 +111,14 @@ impl IndexPart {
|
||||
timeline_layers,
|
||||
layer_metadata,
|
||||
disk_consistent_lsn,
|
||||
metadata,
|
||||
metadata_bytes,
|
||||
deleted_at: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse_metadata(&self) -> anyhow::Result<TimelineMetadata> {
|
||||
TimelineMetadata::from_bytes(&self.metadata_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&UploadQueueInitialized> for IndexPart {
|
||||
@@ -131,31 +126,26 @@ impl TryFrom<&UploadQueueInitialized> for IndexPart {
|
||||
|
||||
fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
|
||||
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
|
||||
let metadata = upload_queue.latest_metadata.clone();
|
||||
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
|
||||
|
||||
Ok(Self::new(
|
||||
upload_queue.latest_files.clone(),
|
||||
disk_consistent_lsn,
|
||||
metadata,
|
||||
metadata_bytes,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Serialized form of [`LayerFileMetadata`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
|
||||
pub struct IndexLayerMetadata {
|
||||
pub(super) file_size: u64,
|
||||
|
||||
#[serde(default = "Generation::none")]
|
||||
#[serde(skip_serializing_if = "Generation::is_none")]
|
||||
pub(super) generation: Generation,
|
||||
}
|
||||
|
||||
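// A minimal serde sketch of the compatibility pattern used above: a field that
// defaults when absent and is skipped when empty, so old and new index_part.json
// variants both round-trip. Uses serde/serde_json; the field shape is illustrative.
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct ExampleLayerMetadata {
    file_size: u64,
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
    generation: Option<u32>,
}

fn main() {
    // JSON written before the field existed still deserializes.
    let old: ExampleLayerMetadata = serde_json::from_str(r#"{ "file_size": 42 }"#).unwrap();
    assert_eq!(old.generation, None);
    // And serializing it back does not emit the absent field.
    assert_eq!(serde_json::to_string(&old).unwrap(), r#"{"file_size":42}"#);
}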
impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
|
||||
fn from(other: &'_ LayerFileMetadata) -> Self {
|
||||
IndexLayerMetadata {
|
||||
file_size: other.file_size,
|
||||
generation: other.generation,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -184,17 +174,15 @@ mod tests {
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this, but a tool such as jq might
|
||||
// represent it as a double, for example.
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
deleted_at: None,
|
||||
};
|
||||
|
||||
@@ -213,7 +201,7 @@ mod tests {
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
@@ -223,17 +211,15 @@ mod tests {
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this, but a tool such as jq might
|
||||
// represent it as a double, for example.
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
deleted_at: None,
|
||||
};
|
||||
|
||||
@@ -252,7 +238,7 @@ mod tests {
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
|
||||
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
|
||||
"deleted_at": "2023-07-31T09:00:00.123"
|
||||
}"#;
|
||||
|
||||
@@ -263,17 +249,15 @@ mod tests {
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this, but a tool such as jq might
|
||||
// represent it as a double, for example.
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
|
||||
};
|
||||
@@ -297,7 +281,7 @@ mod tests {
|
||||
timeline_layers: HashSet::new(),
|
||||
layer_metadata: HashMap::new(),
|
||||
disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[
|
||||
metadata_bytes: [
|
||||
136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83,
|
||||
38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255,
|
||||
210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73,
|
||||
@@ -318,8 +302,8 @@ mod tests {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0,
|
||||
])
|
||||
.unwrap(),
|
||||
]
|
||||
.to_vec(),
|
||||
deleted_at: None,
|
||||
};
|
||||
|
||||
|
||||
@@ -5,11 +5,7 @@ use fail::fail_point;
|
||||
use std::{io::ErrorKind, path::Path};
|
||||
use tokio::fs;
|
||||
|
||||
use super::Generation;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
tenant::remote_timeline_client::{index::IndexPart, remote_index_path, remote_path},
|
||||
};
|
||||
use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
@@ -19,10 +15,10 @@ use tracing::info;
|
||||
|
||||
/// Serializes and uploads the given index part data to the remote storage.
|
||||
pub(super) async fn upload_index_part<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
generation: Generation,
|
||||
index_part: &'a IndexPart,
|
||||
) -> anyhow::Result<()> {
|
||||
tracing::trace!("uploading new index part");
|
||||
@@ -36,9 +32,13 @@ pub(super) async fn upload_index_part<'a>(
|
||||
let index_part_size = index_part_bytes.len();
|
||||
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
|
||||
|
||||
let remote_path = remote_index_path(tenant_id, timeline_id, generation);
|
||||
let index_part_path = conf
|
||||
.metadata_path(tenant_id, timeline_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let storage_path = conf.remote_path(&index_part_path)?;
|
||||
|
||||
storage
|
||||
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
|
||||
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to upload index part for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
@@ -52,13 +52,12 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
source_path: &'a Path,
|
||||
known_metadata: &'a LayerFileMetadata,
|
||||
generation: Generation,
|
||||
) -> anyhow::Result<()> {
|
||||
fail_point!("before-upload-layer", |_| {
|
||||
bail!("failpoint before-upload-layer")
|
||||
});
|
||||
let storage_path = conf.remote_path(source_path)?;
|
||||
|
||||
let storage_path = remote_path(conf, source_path, Some(generation))?;
|
||||
let source_file_res = fs::File::open(&source_path).await;
|
||||
let source_file = match source_file_res {
|
||||
Ok(source_file) => source_file,
|
||||
|
||||
@@ -41,6 +41,8 @@ pub use inmemory_layer::InMemoryLayer;
|
||||
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
||||
pub use remote_layer::RemoteLayer;
|
||||
|
||||
use super::timeline::layer_manager::LayerManager;
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
T: PartialOrd<T>,
|
||||
@@ -173,9 +175,16 @@ impl LayerAccessStats {
|
||||
///
|
||||
/// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
|
||||
pub(crate) fn for_loading_layer(
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
status: LayerResidenceStatus,
|
||||
) -> Self {
|
||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
||||
new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
|
||||
new.record_residence_event(
|
||||
layer_map_lock_held_witness,
|
||||
status,
|
||||
LayerResidenceEventReason::LayerLoad,
|
||||
);
|
||||
new
|
||||
}
|
||||
|
||||
@@ -188,6 +197,7 @@ impl LayerAccessStats {
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn clone_for_residence_change(
|
||||
&self,
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
new_status: LayerResidenceStatus,
|
||||
) -> LayerAccessStats {
|
||||
let clone = {
|
||||
@@ -195,7 +205,11 @@ impl LayerAccessStats {
|
||||
inner.clone()
|
||||
};
|
||||
let new = LayerAccessStats(Mutex::new(clone));
|
||||
new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
|
||||
new.record_residence_event(
|
||||
layer_map_lock_held_witness,
|
||||
new_status,
|
||||
LayerResidenceEventReason::ResidenceChange,
|
||||
);
|
||||
new
|
||||
}
|
||||
|
||||
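// A generic sketch of the "lock-held witness" parameter used above: functions that
// must only run while the layer map lock is held take a reference that can only be
// obtained through that lock, so the requirement is checked at compile time.
use std::sync::Mutex;

struct LayerManager; // stand-in for the real layer map guard type

struct AccessStats;

impl AccessStats {
    fn record_event(&self, _witness: &LayerManager) {
        // Callable only by code that can present a &LayerManager,
        // i.e. code currently holding the layer map lock.
    }
}

fn main() {
    let layers = Mutex::new(LayerManager);
    let stats = AccessStats;
    let guard = layers.lock().unwrap();
    stats.record_event(&guard); // the MutexGuard derefs to &LayerManager
}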
@@ -215,6 +229,7 @@ impl LayerAccessStats {
|
||||
///
|
||||
pub(crate) fn record_residence_event(
|
||||
&self,
|
||||
_layer_map_lock_held_witness: &LayerManager,
|
||||
status: LayerResidenceStatus,
|
||||
reason: LayerResidenceEventReason,
|
||||
) {
|
||||
|
||||
@@ -318,28 +318,30 @@ impl DeltaLayer {
|
||||
|
||||
tree_reader.dump().await?;
|
||||
|
||||
let keys = DeltaLayerInner::load_keys(&inner).await?;
|
||||
let keys = DeltaLayerInner::load_keys(&Ref(&**inner)).await?;
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
async fn dump_blob(val: ValueRef<'_>) -> Result<String> {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
}
|
||||
let dump_blob = |val: ValueRef<_>| -> _ {
|
||||
async move {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
}
|
||||
};
|
||||
|
||||
for entry in keys {
|
||||
let DeltaEntry { key, lsn, val, .. } = entry;
|
||||
@@ -467,7 +469,7 @@ impl DeltaLayer {
|
||||
PathOrConf::Path(_) => None,
|
||||
};
|
||||
|
||||
let loaded = DeltaLayerInner::load(&path, summary).await?;
|
||||
let loaded = DeltaLayerInner::load(&path, summary)?;
|
||||
|
||||
if let PathOrConf::Path(ref path) = self.path_or_conf {
|
||||
// not production code
|
||||
@@ -550,12 +552,17 @@ impl DeltaLayer {
|
||||
/// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
|
||||
///
|
||||
/// The value can be obtained via the [`ValueRef::load`] function.
|
||||
pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
|
||||
pub(crate) async fn load_keys(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Vec<DeltaEntry<Ref<&'_ DeltaLayerInner>>>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.await
|
||||
.context("load delta layer keys")?;
|
||||
DeltaLayerInner::load_keys(inner)
|
||||
|
||||
let inner = Ref(&**inner);
|
||||
DeltaLayerInner::load_keys(&inner)
|
||||
.await
|
||||
.context("Layer index is corrupted")
|
||||
}
|
||||
@@ -841,15 +848,12 @@ impl Drop for DeltaLayerWriter {
|
||||
}
|
||||
|
||||
impl DeltaLayerInner {
|
||||
pub(super) async fn load(
|
||||
path: &std::path::Path,
|
||||
summary: Option<Summary>,
|
||||
) -> anyhow::Result<Self> {
|
||||
pub(super) fn load(path: &std::path::Path, summary: Option<Summary>) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
|
||||
let summary_blk = file.read_blk(0).await?;
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
|
||||
if let Some(mut expected_summary) = summary {
|
||||
@@ -954,14 +958,14 @@ impl DeltaLayerInner {
|
||||
|
||||
pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
|
||||
this: &T,
|
||||
) -> Result<Vec<DeltaEntry<'_>>> {
|
||||
) -> Result<Vec<DeltaEntry<T>>> {
|
||||
let dl = this.as_ref();
|
||||
let file = &dl.file;
|
||||
|
||||
let tree_reader =
|
||||
DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
|
||||
|
||||
let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();
|
||||
let mut all_keys: Vec<DeltaEntry<T>> = Vec::new();
|
||||
|
||||
tree_reader
|
||||
.visit(
|
||||
@@ -971,9 +975,7 @@ impl DeltaLayerInner {
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let val_ref = ValueRef {
|
||||
blob_ref: BlobRef(value),
|
||||
reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
|
||||
Adapter(dl),
|
||||
)),
|
||||
reader: BlockCursor::new(Adapter(this.clone())),
|
||||
};
|
||||
let pos = BlobRef(value).pos();
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
@@ -1002,23 +1004,43 @@ impl DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
/// Cloneable borrow wrapper to make borrows behave like smart pointers.
|
||||
///
|
||||
/// Shared references are trivially copyable. This wrapper avoids confusion that would otherwise
|
||||
/// arise from attempting to clone DeltaLayerInner.
|
||||
pub(crate) struct Ref<T>(T);
|
||||
|
||||
impl<'a, T> AsRef<T> for Ref<&'a T> {
|
||||
fn as_ref(&self) -> &T {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Clone for Ref<&'a T> {
|
||||
fn clone(&self) -> Self {
|
||||
*self
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Copy for Ref<&'a T> {}
|
||||
|
||||
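// Self-contained illustration of the `Ref` wrapper introduced above: a copyable
// newtype over a shared reference that satisfies `AsRef<T> + Clone` bounds without
// ever cloning the wrapped value. Types here are illustrative.
struct Inner {
    data: u64,
}

struct Ref<T>(T);

impl<'a, T> AsRef<T> for Ref<&'a T> {
    fn as_ref(&self) -> &T {
        self.0
    }
}

impl<'a, T> Clone for Ref<&'a T> {
    fn clone(&self) -> Self {
        *self
    }
}

impl<'a, T> Copy for Ref<&'a T> {}

fn read_twice<T: AsRef<Inner> + Clone>(r: T) -> u64 {
    let copy = r.clone();
    copy.as_ref().data + r.as_ref().data
}

fn main() {
    let inner = Inner { data: 21 };
    assert_eq!(read_twice(Ref(&inner)), 42);
}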
/// A set of data associated with a delta layer key and its value
|
||||
pub struct DeltaEntry<'a> {
|
||||
pub struct DeltaEntry<T: AsRef<DeltaLayerInner>> {
|
||||
pub key: Key,
|
||||
pub lsn: Lsn,
|
||||
/// Size of the stored value
|
||||
pub size: u64,
|
||||
/// Reference to the on-disk value
|
||||
pub val: ValueRef<'a>,
|
||||
pub val: ValueRef<T>,
|
||||
}
|
||||
|
||||
/// Reference to an on-disk value
|
||||
pub struct ValueRef<'a> {
|
||||
pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
|
||||
blob_ref: BlobRef,
|
||||
reader: BlockCursor<'a>,
|
||||
reader: BlockCursor<Adapter<T>>,
|
||||
}
|
||||
|
||||
impl<'a> ValueRef<'a> {
|
||||
impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
|
||||
/// Loads the value from disk
|
||||
pub async fn load(&self) -> Result<Value> {
|
||||
// theoretically we *could* record an access time for each, but it does not really matter
|
||||
@@ -1028,10 +1050,10 @@ impl<'a> ValueRef<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct Adapter<T>(T);
|
||||
struct Adapter<T: AsRef<DeltaLayerInner>>(T);
|
||||
|
||||
impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
|
||||
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
self.0.as_ref().file.read_blk(blknum).await
|
||||
impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
self.0.as_ref().file.read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -212,20 +212,9 @@ pub enum LayerFileName {
|
||||
}
|
||||
|
||||
impl LayerFileName {
|
||||
pub(crate) fn file_name(&self) -> String {
|
||||
pub fn file_name(&self) -> String {
|
||||
self.to_string()
|
||||
}
|
||||
|
||||
/// Determines if this layer file is considered to be in the future, meaning we will discard such
|
||||
/// layers during timeline initialization, based on the given disk_consistent_lsn.
|
||||
pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool {
|
||||
use LayerFileName::*;
|
||||
match self {
|
||||
Image(file_name) if file_name.lsn > disk_consistent_lsn => true,
|
||||
Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
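// Sketch of the "future layer" rule described above, using plain u64s for LSNs:
// an image layer is in the future if its LSN lies beyond the disk-consistent LSN,
// a delta layer if its exclusive end LSN lies beyond disk_consistent_lsn + 1.
fn image_is_in_future(image_lsn: u64, disk_consistent_lsn: u64) -> bool {
    image_lsn > disk_consistent_lsn
}

fn delta_is_in_future(lsn_range_end: u64, disk_consistent_lsn: u64) -> bool {
    lsn_range_end > disk_consistent_lsn + 1
}

fn main() {
    assert!(!image_is_in_future(100, 100));
    assert!(image_is_in_future(101, 100));
    assert!(!delta_is_in_future(101, 100));
    assert!(delta_is_in_future(102, 100));
}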
impl fmt::Display for LayerFileName {
|
||||
@@ -274,8 +263,8 @@ impl serde::Serialize for LayerFileName {
|
||||
S: serde::Serializer,
|
||||
{
|
||||
match self {
|
||||
Self::Image(fname) => serializer.collect_str(fname),
|
||||
Self::Delta(fname) => serializer.collect_str(fname),
|
||||
Self::Image(fname) => serializer.serialize_str(&fname.to_string()),
|
||||
Self::Delta(fname) => serializer.serialize_str(&fname.to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
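// Illustration of the serializer change above: `collect_str` streams anything that
// implements Display straight into the serializer, avoiding the intermediate String
// that `serialize_str(&value.to_string())` builds. Assumes the serde crate.
use serde::Serializer;

fn serialize_display<S, T>(serializer: S, value: &T) -> Result<S::Ok, S::Error>
where
    S: Serializer,
    T: std::fmt::Display,
{
    serializer.collect_str(value)
}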
@@ -349,8 +349,7 @@ impl ImageLayer {
|
||||
PathOrConf::Path(_) => None,
|
||||
};
|
||||
|
||||
let loaded =
|
||||
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary).await?;
|
||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary)?;
|
||||
|
||||
if let PathOrConf::Path(ref path) = self.path_or_conf {
|
||||
// not production code
|
||||
@@ -433,7 +432,7 @@ impl ImageLayer {
|
||||
}
|
||||
|
||||
impl ImageLayerInner {
|
||||
pub(super) async fn load(
|
||||
pub(super) fn load(
|
||||
path: &std::path::Path,
|
||||
lsn: Lsn,
|
||||
summary: Option<Summary>,
|
||||
@@ -441,7 +440,7 @@ impl ImageLayerInner {
|
||||
let file = VirtualFile::open(path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = file.read_blk(0).await?;
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
|
||||
if let Some(mut expected_summary) = summary {
|
||||
|
||||
@@ -7,12 +7,14 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
|
||||
use crate::walrecord;
|
||||
use anyhow::{ensure, Result};
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::OnceLock;
|
||||
use tracing::*;
|
||||
@@ -30,6 +32,12 @@ use tokio::sync::RwLock;
|
||||
|
||||
use super::{DeltaLayer, DeltaLayerWriter, Layer};
|
||||
|
||||
thread_local! {
|
||||
/// A buffer for serializing objects during [`InMemoryLayer::put_value`].
|
||||
/// This buffer is reused for each serialization to avoid additional malloc calls.
|
||||
static SER_BUFFER: RefCell<Vec<u8>> = RefCell::new(Vec::new());
|
||||
}
|
||||
|
||||
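// Standalone sketch of the thread-local scratch buffer declared above: each thread
// keeps one Vec that is cleared and reused, so repeated serializations do not pay
// for a fresh allocation every time. Names here are illustrative.
use std::cell::RefCell;

thread_local! {
    static SCRATCH: RefCell<Vec<u8>> = RefCell::new(Vec::new());
}

fn with_scratch<R>(f: impl FnOnce(&mut Vec<u8>) -> R) -> R {
    SCRATCH.with(|cell| {
        let mut buf = cell.borrow_mut();
        buf.clear();
        f(&mut buf)
    })
}

fn main() {
    let len = with_scratch(|buf| {
        buf.extend_from_slice(b"hello");
        buf.len()
    });
    assert_eq!(len, 5);
}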
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
@@ -230,7 +238,7 @@ impl InMemoryLayer {
|
||||
///
|
||||
pub async fn size(&self) -> Result<u64> {
|
||||
let inner = self.inner.read().await;
|
||||
Ok(inner.file.len())
|
||||
Ok(inner.file.size())
|
||||
}
|
||||
|
||||
///
|
||||
@@ -265,17 +273,17 @@ impl InMemoryLayer {
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
let inner: &mut _ = &mut *self.inner.write().await;
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
|
||||
let off = {
|
||||
// Avoid doing allocations for "small" values.
|
||||
// In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
|
||||
// https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
|
||||
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
|
||||
buf.clear();
|
||||
val.ser_into(&mut buf)?;
|
||||
inner.file.write_blob(&buf).await?
|
||||
SER_BUFFER.with(|x| -> Result<_> {
|
||||
let mut buf = x.borrow_mut();
|
||||
buf.clear();
|
||||
val.ser_into(&mut (*buf))?;
|
||||
let off = inner.file.write_blob(&buf)?;
|
||||
Ok(off)
|
||||
})?
|
||||
};
|
||||
|
||||
let vec_map = inner.index.entry(key).or_default();
|
||||
|
||||
@@ -185,7 +185,7 @@ impl RemoteLayer {
|
||||
/// Create a Layer struct representing this layer, after it has been downloaded.
|
||||
pub(crate) fn create_downloaded_layer(
|
||||
&self,
|
||||
_layer_map_lock_held_witness: &LayerManager,
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
conf: &'static PageServerConf,
|
||||
file_size: u64,
|
||||
) -> Arc<dyn PersistentLayer> {
|
||||
@@ -197,8 +197,10 @@ impl RemoteLayer {
|
||||
self.desc.tenant_id,
|
||||
&fname,
|
||||
file_size,
|
||||
self.access_stats
|
||||
.clone_for_residence_change(LayerResidenceStatus::Resident),
|
||||
self.access_stats.clone_for_residence_change(
|
||||
layer_map_lock_held_witness,
|
||||
LayerResidenceStatus::Resident,
|
||||
),
|
||||
))
|
||||
} else {
|
||||
let fname = self.desc.image_file_name();
|
||||
@@ -208,8 +210,10 @@ impl RemoteLayer {
|
||||
self.desc.tenant_id,
|
||||
&fname,
|
||||
file_size,
|
||||
self.access_stats
|
||||
.clone_for_residence_change(LayerResidenceStatus::Resident),
|
||||
self.access_stats.clone_for_residence_change(
|
||||
layer_map_lock_held_witness,
|
||||
LayerResidenceStatus::Resident,
|
||||
),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
pub mod delete;
|
||||
mod eviction_task;
|
||||
mod init;
|
||||
pub mod layer_manager;
|
||||
mod logical_size;
|
||||
pub mod span;
|
||||
@@ -28,6 +27,7 @@ use utils::id::TenantTimelineId;
|
||||
|
||||
use std::cmp::{max, min, Ordering};
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet};
|
||||
use std::fs;
|
||||
use std::ops::{Deref, Range};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::pin::pin;
|
||||
@@ -38,14 +38,15 @@ use std::time::{Duration, Instant, SystemTime};
|
||||
use crate::context::{
|
||||
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
|
||||
};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
|
||||
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
|
||||
use crate::tenant::storage_layer::{
|
||||
DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer,
|
||||
DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
|
||||
LayerAccessStats, LayerFileName, RemoteLayer,
|
||||
};
|
||||
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
||||
use crate::tenant::{
|
||||
ephemeral_file::is_ephemeral_file,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
metadata::{save_metadata, TimelineMetadata},
|
||||
par_fsync,
|
||||
@@ -68,7 +69,6 @@ use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::to_pg_timestamp;
|
||||
use utils::{
|
||||
completion,
|
||||
generation::Generation,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::{AtomicLsn, Lsn, RecordLsn},
|
||||
seqwait::SeqWait,
|
||||
@@ -78,10 +78,11 @@ use utils::{
|
||||
use crate::page_cache;
|
||||
use crate::repository::GcResult;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::METADATA_FILE_NAME;
|
||||
use crate::ZERO_PAGE;
|
||||
use crate::{is_temporary, task_mgr};
|
||||
|
||||
use self::delete::DeleteTimelineFlow;
|
||||
pub(super) use self::eviction_task::EvictionTaskTenantState;
|
||||
@@ -143,7 +144,6 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||
/// The outward-facing resources required to build a Timeline
|
||||
pub struct TimelineResources {
|
||||
pub remote_client: Option<RemoteTimelineClient>,
|
||||
pub deletion_queue_client: Option<DeletionQueueClient>,
|
||||
}
|
||||
|
||||
pub struct Timeline {
|
||||
@@ -155,9 +155,6 @@ pub struct Timeline {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
|
||||
// The generation of the tenant that instantiated us: this is used for safety when writing remote objects
|
||||
generation: Generation,
|
||||
|
||||
pub pg_version: u32,
|
||||
|
||||
/// The tuple has two elements.
|
||||
@@ -201,9 +198,6 @@ pub struct Timeline {
|
||||
/// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
|
||||
pub remote_client: Option<Arc<RemoteTimelineClient>>,
|
||||
|
||||
/// Deletion queue: a global queue, separate from the remote storage client's upload queue
|
||||
deletion_queue_client: Option<Arc<DeletionQueueClient>>,
|
||||
|
||||
// What page versions do we hold in the repository? If we get a
|
||||
// request > last_record_lsn, we need to wait until we receive all
|
||||
// the WAL up to the request. The SeqWait provides functions for
|
||||
@@ -474,7 +468,7 @@ impl Timeline {
|
||||
// The cached image can be returned directly if there is no WAL between the cached image
|
||||
// and requested LSN. The cached image can also be used to reduce the amount of WAL needed
|
||||
// for redo.
|
||||
let cached_page_img = match self.lookup_cached_page(&key, lsn).await {
|
||||
let cached_page_img = match self.lookup_cached_page(&key, lsn) {
|
||||
Some((cached_lsn, cached_img)) => {
|
||||
match cached_lsn.cmp(&lsn) {
|
||||
Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
|
||||
@@ -503,7 +497,6 @@ impl Timeline {
|
||||
|
||||
RECONSTRUCT_TIME
|
||||
.observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
|
||||
.await
|
||||
}
|
||||
|
||||
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
|
||||
@@ -1208,7 +1201,7 @@ impl Timeline {
|
||||
Ok(delta) => Some(delta),
|
||||
};
|
||||
|
||||
let layer_metadata = LayerFileMetadata::new(layer_file_size, self.generation);
|
||||
let layer_metadata = LayerFileMetadata::new(layer_file_size);
|
||||
|
||||
let new_remote_layer = Arc::new(match local_layer.filename() {
|
||||
LayerFileName::Image(image_name) => RemoteLayer::new_img(
|
||||
@@ -1218,7 +1211,7 @@ impl Timeline {
|
||||
&layer_metadata,
|
||||
local_layer
|
||||
.access_stats()
|
||||
.clone_for_residence_change(LayerResidenceStatus::Evicted),
|
||||
.clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
|
||||
),
|
||||
LayerFileName::Delta(delta_name) => RemoteLayer::new_delta(
|
||||
self.tenant_id,
|
||||
@@ -1227,7 +1220,7 @@ impl Timeline {
|
||||
&layer_metadata,
|
||||
local_layer
|
||||
.access_stats()
|
||||
.clone_for_residence_change(LayerResidenceStatus::Evicted),
|
||||
.clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
|
||||
),
|
||||
});
|
||||
|
||||
@@ -1271,18 +1264,6 @@ impl Timeline {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete_all_remote(&self) -> anyhow::Result<()> {
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
if let Some(deletion_queue_client) = &self.deletion_queue_client {
|
||||
remote_client.delete_all(deletion_queue_client).await
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -1398,7 +1379,6 @@ impl Timeline {
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
resources: TimelineResources,
|
||||
pg_version: u32,
|
||||
@@ -1428,7 +1408,6 @@ impl Timeline {
|
||||
myself: myself.clone(),
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
generation,
|
||||
pg_version,
|
||||
layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
|
||||
wanted_image_layers: Mutex::new(None),
|
||||
@@ -1437,7 +1416,6 @@ impl Timeline {
|
||||
walreceiver: Mutex::new(None),
|
||||
|
||||
remote_client: resources.remote_client.map(Arc::new),
|
||||
deletion_queue_client: resources.deletion_queue_client.map(Arc::new),
|
||||
|
||||
// initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
|
||||
last_record_lsn: SeqWait::new(RecordLsn {
|
||||
@@ -1540,7 +1518,7 @@ impl Timeline {
|
||||
let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
|
||||
let self_clone = Arc::clone(self);
|
||||
|
||||
debug!("spawning flush loop");
|
||||
info!("spawning flush loop");
|
||||
*flush_loop_state = FlushLoopState::Running {
|
||||
#[cfg(test)]
|
||||
expect_initdb_optimization: false,
|
||||
@@ -1611,7 +1589,9 @@ impl Timeline {
|
||||
));
|
||||
}
|
||||
|
||||
///
|
||||
/// Initialize with an empty layer map. Used when creating a new timeline.
|
||||
///
|
||||
pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) {
|
||||
let mut layers = self.layers.try_write().expect(
|
||||
"in the context where we call this function, no other task has access to the object",
|
||||
@@ -1619,16 +1599,10 @@ impl Timeline {
|
||||
layers.initialize_empty(Lsn(start_lsn.0));
|
||||
}
|
||||
|
||||
/// Scan the timeline directory, cleanup, populate the layer map, and schedule uploads for local-only
|
||||
/// files.
|
||||
pub(super) async fn load_layer_map(
|
||||
&self,
|
||||
disk_consistent_lsn: Lsn,
|
||||
index_part: Option<IndexPart>,
|
||||
) -> anyhow::Result<()> {
|
||||
use init::{Decision::*, Discovered, FutureLayer};
|
||||
use LayerFileName::*;
|
||||
|
||||
///
|
||||
/// Scan the timeline directory to populate the layer map.
|
||||
///
|
||||
pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
|
||||
let mut guard = self.layers.write().await;
|
||||
|
||||
let timer = self.metrics.load_layer_map_histo.start_timer();
|
||||
@@ -1636,164 +1610,102 @@ impl Timeline {
|
||||
// Scan timeline directory and create ImageFileName and DeltaFilename
|
||||
// structs representing all files on disk
|
||||
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
|
||||
let (conf, tenant_id, timeline_id) = (self.conf, self.tenant_id, self.timeline_id);
|
||||
let span = tracing::Span::current();
|
||||
// total size of layer files in the current timeline directory
|
||||
let mut total_physical_size = 0;
|
||||
|
||||
// Copy to move into the task we're about to spawn
|
||||
let generation = self.generation;
|
||||
let mut loaded_layers = Vec::<Arc<dyn PersistentLayer>>::new();
|
||||
|
||||
let (loaded_layers, to_sync, total_physical_size) = tokio::task::spawn_blocking({
|
||||
move || {
|
||||
let _g = span.entered();
|
||||
let discovered = init::scan_timeline_dir(&timeline_path)?;
|
||||
let mut discovered_layers = Vec::with_capacity(discovered.len());
|
||||
let mut unrecognized_files = Vec::new();
|
||||
for direntry in fs::read_dir(timeline_path)? {
|
||||
let direntry = direntry?;
|
||||
let direntry_path = direntry.path();
|
||||
let fname = direntry.file_name();
|
||||
let fname = fname.to_string_lossy();
|
||||
|
||||
let mut path = timeline_path;
|
||||
|
||||
for discovered in discovered {
|
||||
let (name, kind) = match discovered {
|
||||
Discovered::Layer(file_name, file_size) => {
|
||||
discovered_layers.push((file_name, file_size));
|
||||
continue;
|
||||
}
|
||||
Discovered::Metadata | Discovered::IgnoredBackup => {
|
||||
continue;
|
||||
}
|
||||
Discovered::Unknown(file_name) => {
|
||||
// we will later error if there are any
|
||||
unrecognized_files.push(file_name);
|
||||
continue;
|
||||
}
|
||||
Discovered::Ephemeral(name) => (name, "old ephemeral file"),
|
||||
Discovered::Temporary(name) => (name, "temporary timeline file"),
|
||||
Discovered::TemporaryDownload(name) => (name, "temporary download"),
|
||||
};
|
||||
path.push(name);
|
||||
init::cleanup(&path, kind)?;
|
||||
path.pop();
|
||||
}
|
||||
|
||||
if !unrecognized_files.is_empty() {
|
||||
// assume that if there are any, there are many of them.
|
||||
let n = unrecognized_files.len();
|
||||
let first = &unrecognized_files[..n.min(10)];
|
||||
anyhow::bail!(
|
||||
"unrecognized files in timeline dir (total {n}), first 10: {first:?}"
|
||||
if let Some(filename) = ImageFileName::parse_str(&fname) {
|
||||
// create an ImageLayer struct for each image file.
|
||||
if filename.lsn > disk_consistent_lsn {
|
||||
info!(
|
||||
"found future image layer {} on timeline {} disk_consistent_lsn is {}",
|
||||
filename, self.timeline_id, disk_consistent_lsn
|
||||
);
|
||||
|
||||
rename_to_backup(&direntry_path)?;
|
||||
continue;
|
||||
}
|
||||
|
||||
let decided = init::reconcile(
|
||||
discovered_layers,
|
||||
index_part.as_ref(),
|
||||
disk_consistent_lsn,
|
||||
generation,
|
||||
let file_size = direntry_path.metadata()?.len();
|
||||
let stats =
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
|
||||
|
||||
let layer = ImageLayer::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
&filename,
|
||||
file_size,
|
||||
stats,
|
||||
);
|
||||
|
||||
let mut loaded_layers = Vec::new();
|
||||
let mut needs_upload = Vec::new();
|
||||
let mut needs_cleanup = Vec::new();
|
||||
let mut total_physical_size = 0;
|
||||
total_physical_size += file_size;
|
||||
loaded_layers.push(Arc::new(layer));
|
||||
} else if let Some(filename) = DeltaFileName::parse_str(&fname) {
|
||||
// Create a DeltaLayer struct for each delta file.
|
||||
// The end-LSN is exclusive, while disk_consistent_lsn is
|
||||
// inclusive. For example, if disk_consistent_lsn is 100, it is
|
||||
// OK for a delta layer to have end LSN 101, but if the end LSN
|
||||
// is 102, then it might not have been fully flushed to disk
|
||||
// before crash.
|
||||
if filename.lsn_range.end > disk_consistent_lsn + 1 {
|
||||
info!(
|
||||
"found future delta layer {} on timeline {} disk_consistent_lsn is {}",
|
||||
filename, self.timeline_id, disk_consistent_lsn
|
||||
);
|
||||
|
||||
for (name, decision) in decided {
|
||||
let decision = match decision {
|
||||
Ok(UseRemote { local, remote }) => {
|
||||
path.push(name.file_name());
|
||||
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
|
||||
path.pop();
|
||||
|
||||
UseRemote { local, remote }
|
||||
}
|
||||
Ok(decision) => decision,
|
||||
Err(FutureLayer { local }) => {
|
||||
if local.is_some() {
|
||||
path.push(name.file_name());
|
||||
init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?;
|
||||
path.pop();
|
||||
}
|
||||
needs_cleanup.push(name);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match &name {
|
||||
Delta(d) => assert!(d.lsn_range.end <= disk_consistent_lsn + 1),
|
||||
Image(i) => assert!(i.lsn <= disk_consistent_lsn),
|
||||
}
|
||||
|
||||
let status = match &decision {
|
||||
UseLocal(_) | NeedsUpload(_) => LayerResidenceStatus::Resident,
|
||||
Evicted(_) | UseRemote { .. } => LayerResidenceStatus::Evicted,
|
||||
};
|
||||
|
||||
let stats = LayerAccessStats::for_loading_layer(status);
|
||||
|
||||
let layer: Arc<dyn PersistentLayer> = match (name, &decision) {
|
||||
(Delta(d), UseLocal(m) | NeedsUpload(m)) => {
|
||||
total_physical_size += m.file_size();
|
||||
Arc::new(DeltaLayer::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
&d,
|
||||
m.file_size(),
|
||||
stats,
|
||||
))
|
||||
}
|
||||
(Image(i), UseLocal(m) | NeedsUpload(m)) => {
|
||||
total_physical_size += m.file_size();
|
||||
Arc::new(ImageLayer::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
&i,
|
||||
m.file_size(),
|
||||
stats,
|
||||
))
|
||||
}
|
||||
(Delta(d), Evicted(remote) | UseRemote { remote, .. }) => Arc::new(
|
||||
RemoteLayer::new_delta(tenant_id, timeline_id, &d, remote, stats),
|
||||
),
|
||||
(Image(i), Evicted(remote) | UseRemote { remote, .. }) => Arc::new(
|
||||
RemoteLayer::new_img(tenant_id, timeline_id, &i, remote, stats),
|
||||
),
|
||||
};
|
||||
|
||||
if let NeedsUpload(m) = decision {
|
||||
needs_upload.push((layer.clone(), m));
|
||||
}
|
||||
|
||||
loaded_layers.push(layer);
|
||||
rename_to_backup(&direntry_path)?;
|
||||
continue;
|
||||
}
|
||||
Ok((
|
||||
loaded_layers,
|
||||
(needs_upload, needs_cleanup),
|
||||
total_physical_size,
|
||||
))
|
||||
|
||||
let file_size = direntry_path.metadata()?.len();
|
||||
let stats =
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
|
||||
|
||||
let layer = DeltaLayer::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
&filename,
|
||||
file_size,
|
||||
stats,
|
||||
);
|
||||
|
||||
total_physical_size += file_size;
|
||||
loaded_layers.push(Arc::new(layer));
|
||||
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
|
||||
// ignore these
|
||||
} else if remote_timeline_client::is_temp_download_file(&direntry_path) {
|
||||
info!(
|
||||
"skipping temp download file, reconcile_with_remote will resume / clean up: {}",
|
||||
fname
|
||||
);
|
||||
} else if is_ephemeral_file(&fname) {
|
||||
// Delete any old ephemeral files
|
||||
trace!("deleting old ephemeral file in timeline dir: {}", fname);
|
||||
fs::remove_file(&direntry_path)?;
|
||||
} else if is_temporary(&direntry_path) {
|
||||
info!("removing temp timeline file at {}", direntry_path.display());
|
||||
fs::remove_file(&direntry_path).with_context(|| {
|
||||
format!(
|
||||
"failed to remove temp download file at {}",
|
||||
direntry_path.display()
|
||||
)
|
||||
})?;
|
||||
} else {
|
||||
warn!("unrecognized filename in timeline dir: {}", fname);
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(anyhow::Error::new)
|
||||
.and_then(|x| x)?;
|
||||
}
|
||||
|
||||
let num_layers = loaded_layers.len();
|
||||
|
||||
guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
|
||||
|
||||
if let Some(rtc) = self.remote_client.as_ref() {
|
||||
// Deletion queue client is always Some if remote_client is Some
|
||||
let deletion_queue_client = self.deletion_queue_client.as_ref().unwrap();
|
||||
|
||||
let (needs_upload, needs_cleanup) = to_sync;
|
||||
for (layer, m) in needs_upload {
|
||||
rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
|
||||
}
|
||||
rtc.schedule_layer_file_deletion(&needs_cleanup, deletion_queue_client)
|
||||
.await?;
|
||||
rtc.schedule_index_upload_for_file_changes()?;
|
||||
// Tenant::create_timeline will wait for these uploads to happen before returning, or
|
||||
// on retry.
|
||||
}
|
||||
guard.initialize_local_layers(loaded_layers, Lsn(disk_consistent_lsn.0) + 1);
|
||||
|
||||
info!(
|
||||
"loaded layer map with {} layers at {}, total physical size: {}",
|
||||
@@ -1804,6 +1716,236 @@ impl Timeline {
|
||||
.set(total_physical_size);
|
||||
|
||||
timer.stop_and_record();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_remote_layers(
|
||||
&self,
|
||||
index_part: &IndexPart,
|
||||
local_layers: HashMap<LayerFileName, Arc<dyn PersistentLayer>>,
|
||||
up_to_date_disk_consistent_lsn: Lsn,
|
||||
) -> anyhow::Result<HashMap<LayerFileName, Arc<dyn PersistentLayer>>> {
|
||||
// Are we missing some files that are present in remote storage?
|
||||
// Create RemoteLayer instances for them.
|
||||
let mut local_only_layers = local_layers;
|
||||
|
||||
// We're holding a layer map lock for a while but this
|
||||
// method is only called during init so it's fine.
|
||||
let mut guard = self.layers.write().await;
|
||||
|
||||
let mut corrupted_local_layers = Vec::new();
|
||||
let mut added_remote_layers = Vec::new();
|
||||
for remote_layer_name in index_part.layer_metadata.keys() {
|
||||
let local_layer = local_only_layers.remove(remote_layer_name);
|
||||
|
||||
let remote_layer_metadata = index_part
|
||||
.layer_metadata
|
||||
.get(remote_layer_name)
|
||||
.map(LayerFileMetadata::from)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"No remote layer metadata found for layer {}",
|
||||
remote_layer_name.file_name()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Is the local layer's size different from the size stored in the
|
||||
// remote index file?
|
||||
// If so, rename_to_backup those files & replace their local layer with
|
||||
// a RemoteLayer in the layer map so that we re-download them on-demand.
|
||||
if let Some(local_layer) = local_layer {
|
||||
let local_layer_path = local_layer
|
||||
.local_path()
|
||||
.expect("caller must ensure that local_layers only contains local layers");
|
||||
ensure!(
|
||||
local_layer_path.exists(),
|
||||
"every layer from local_layers must exist on disk: {}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
|
||||
let remote_size = remote_layer_metadata.file_size();
|
||||
let metadata = local_layer_path.metadata().with_context(|| {
|
||||
format!(
|
||||
"get file size of local layer {}",
|
||||
local_layer_path.display()
|
||||
)
|
||||
})?;
|
||||
let local_size = metadata.len();
|
||||
if local_size != remote_size {
|
||||
warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
|
||||
if let Err(err) = rename_to_backup(&local_layer_path) {
|
||||
assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
|
||||
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
|
||||
} else {
|
||||
self.metrics.resident_physical_size_gauge.sub(local_size);
|
||||
corrupted_local_layers.push(local_layer);
|
||||
// fall-through to adding the remote layer
|
||||
}
|
||||
} else {
|
||||
debug!(
|
||||
"layer is present locally and file size matches remote, using it: {}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"remote layer does not exist locally, creating remote layer: {}",
|
||||
remote_layer_name.file_name()
|
||||
);
|
||||
|
||||
match remote_layer_name {
|
||||
LayerFileName::Image(imgfilename) => {
|
||||
if imgfilename.lsn > up_to_date_disk_consistent_lsn {
|
||||
info!(
|
||||
"found future image layer {} on timeline {} remote_consistent_lsn is {}",
|
||||
imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
|
||||
);
|
||||
continue;
|
||||
}
|
||||
let stats =
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
|
||||
|
||||
let remote_layer = RemoteLayer::new_img(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
imgfilename,
|
||||
&remote_layer_metadata,
|
||||
stats,
|
||||
);
|
||||
let remote_layer = Arc::new(remote_layer);
|
||||
added_remote_layers.push(remote_layer);
|
||||
}
|
||||
LayerFileName::Delta(deltafilename) => {
|
||||
// Create a RemoteLayer for the delta file.
|
||||
// The end-LSN is exclusive, while disk_consistent_lsn is
|
||||
// inclusive. For example, if disk_consistent_lsn is 100, it is
|
||||
// OK for a delta layer to have end LSN 101, but if the end LSN
|
||||
// is 102, then it might not have been fully flushed to disk
|
||||
// before crash.
|
||||
if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
|
||||
info!(
|
||||
"found future delta layer {} on timeline {} remote_consistent_lsn is {}",
|
||||
deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
|
||||
);
|
||||
continue;
|
||||
}
|
||||
let stats =
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
|
||||
|
||||
let remote_layer = RemoteLayer::new_delta(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
deltafilename,
|
||||
&remote_layer_metadata,
|
||||
stats,
|
||||
);
|
||||
let remote_layer = Arc::new(remote_layer);
|
||||
added_remote_layers.push(remote_layer);
|
||||
}
|
||||
}
|
||||
}
|
||||
guard.initialize_remote_layers(corrupted_local_layers, added_remote_layers);
|
||||
Ok(local_only_layers)
|
||||
}
|
||||
|
||||
/// This function will synchronize local state with what we have in remote storage.
|
||||
///
|
||||
/// Steps taken:
|
||||
/// 1. Initialize upload queue based on `index_part`.
|
||||
/// 2. Create `RemoteLayer` instances for layers that exist only on the remote.
|
||||
/// The list of layers on the remote comes from `index_part`.
|
||||
/// The list of local layers is given by the layer map's `iter_historic_layers()`.
|
||||
/// So, the layer map must have been loaded already.
|
||||
/// 3. Schedule upload of local-only layer files (which will then also update the remote
|
||||
/// IndexPart to include the new layer files).
|
||||
///
|
||||
/// Refer to the [`remote_timeline_client`] module comment for more context.
|
||||
///
|
||||
/// # TODO
|
||||
/// It may be a bit cleaner to do this based on a populated remote client,
/// and then make decisions based on its upload_queue.latest_files.
|
||||
#[instrument(skip(self, index_part, up_to_date_metadata))]
|
||||
pub async fn reconcile_with_remote(
|
||||
&self,
|
||||
up_to_date_metadata: &TimelineMetadata,
|
||||
index_part: Option<&IndexPart>,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("starting");
|
||||
let remote_client = self
|
||||
.remote_client
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("cannot download without remote storage"))?;
|
||||
|
||||
let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn();
|
||||
|
||||
let local_layers = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
layers
|
||||
.iter_historic_layers()
|
||||
.map(|l| (l.filename(), guard.get_from_desc(&l)))
|
||||
.collect::<HashMap<_, _>>()
|
||||
};
|
||||
|
||||
// If no writes happen, new branches do not have any layers, only the metadata file.
|
||||
let has_local_layers = !local_layers.is_empty();
|
||||
let local_only_layers = match index_part {
|
||||
Some(index_part) => {
|
||||
info!(
|
||||
"initializing upload queue from remote index with {} layer files",
|
||||
index_part.layer_metadata.len()
|
||||
);
|
||||
remote_client.init_upload_queue(index_part)?;
|
||||
self.create_remote_layers(index_part, local_layers, disk_consistent_lsn)
|
||||
.await?
|
||||
}
|
||||
None => {
|
||||
info!("initializing upload queue as empty");
|
||||
remote_client.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
|
||||
local_layers
|
||||
}
|
||||
};
|
||||
|
||||
if has_local_layers {
|
||||
// Are there local files that don't exist remotely? Schedule uploads for them.
|
||||
// Local timeline metadata will get uploaded to remote along with the layers.
|
||||
for (layer_name, layer) in &local_only_layers {
|
||||
// XXX solve this in the type system
|
||||
let layer_path = layer
|
||||
.local_path()
|
||||
.expect("local_only_layers only contains local layers");
|
||||
let layer_size = layer_path
|
||||
.metadata()
|
||||
.with_context(|| format!("failed to get file {layer_path:?} metadata"))?
|
||||
.len();
|
||||
info!("scheduling {layer_path:?} for upload");
|
||||
remote_client
|
||||
.schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
|
||||
}
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
} else if index_part.is_none() {
|
||||
// No data in remote storage, no local layers, only a local metadata file.
|
||||
//
|
||||
// TODO https://github.com/neondatabase/neon/issues/3865
|
||||
// Currently, console does not wait for the timeline data upload to the remote storage
|
||||
// and considers the timeline created, expecting other pageserver nodes to work with it.
|
||||
// Branch metadata upload could get interrupted (e.g. the pageserver got killed),
|
||||
// hence any locally existing branch metadata with no remote counterpart should be uploaded,
|
||||
// otherwise any other pageserver won't see the branch on `attach`.
|
||||
//
|
||||
// Once the issue above is addressed, the pageserver should instead remove the branch,
// since absence on S3 means we never acknowledged the branch creation and the console will have to retry;
// there is no need to keep the old files.
|
||||
remote_client.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
|
||||
} else {
|
||||
// Local timeline has a metadata file, the remote one does too, and neither has layers to sync.
|
||||
}
|
||||
|
||||
info!("Done");
|
||||
|
||||
Ok(())
|
||||
}
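
A minimal sketch of the call order implied by the doc comment above (hypothetical caller code, not a literal excerpt from Tenant; `timeline`, `metadata`, and `index_part` are placeholders):

    // The layer map must already be populated before reconciling with remote storage.
    timeline
        .load_layer_map(metadata.disk_consistent_lsn())
        .await?;
    timeline
        .reconcile_with_remote(&metadata, index_part.as_ref())
        .await?;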
@@ -2300,15 +2442,7 @@ impl Timeline {
|
||||
)));
|
||||
}
|
||||
}
|
||||
ancestor
|
||||
.wait_lsn(timeline.ancestor_lsn, ctx)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"wait for lsn {} on ancestor timeline_id={}",
|
||||
timeline.ancestor_lsn, ancestor.timeline_id
|
||||
)
|
||||
})?;
|
||||
ancestor.wait_lsn(timeline.ancestor_lsn, ctx).await?;
|
||||
|
||||
timeline_owned = ancestor;
|
||||
timeline = &*timeline_owned;
|
||||
@@ -2487,14 +2621,13 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
async fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> {
|
||||
fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> {
|
||||
let cache = page_cache::get();
|
||||
|
||||
// FIXME: It's pointless to check the cache for things that are not 8kB pages.
|
||||
// We should look at the key to determine if it's a cacheable object
|
||||
let (lsn, read_guard) = cache
|
||||
.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)
|
||||
.await?;
|
||||
let (lsn, read_guard) =
|
||||
cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?;
|
||||
let img = Bytes::from(read_guard.to_vec());
|
||||
Some((lsn, img))
|
||||
}
|
||||
@@ -2704,7 +2837,7 @@ impl Timeline {
|
||||
(
|
||||
HashMap::from([(
|
||||
layer.filename(),
|
||||
LayerFileMetadata::new(layer.layer_desc().file_size, self.generation),
|
||||
LayerFileMetadata::new(layer.layer_desc().file_size),
|
||||
)]),
|
||||
Some(layer),
|
||||
)
|
||||
@@ -2719,6 +2852,7 @@ impl Timeline {
|
||||
if let Some(ref l) = delta_layer_to_add {
|
||||
// TODO: move access stats, metrics update, etc. into layer manager.
|
||||
l.access_stats().record_residence_event(
|
||||
&guard,
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
@@ -3100,16 +3234,14 @@ impl Timeline {
|
||||
.metadata()
|
||||
.with_context(|| format!("reading metadata of layer file {}", path.file_name()))?;
|
||||
|
||||
layer_paths_to_upload.insert(
|
||||
path,
|
||||
LayerFileMetadata::new(metadata.len(), self.generation),
|
||||
);
|
||||
layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len()));
|
||||
|
||||
self.metrics
|
||||
.resident_physical_size_gauge
|
||||
.add(metadata.len());
|
||||
let l = Arc::new(l);
|
||||
l.access_stats().record_residence_event(
|
||||
&guard,
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
@@ -3778,7 +3910,7 @@ impl Timeline {
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_layer_file_upload(
|
||||
&l.filename(),
|
||||
&LayerFileMetadata::new(metadata.len(), self.generation),
|
||||
&LayerFileMetadata::new(metadata.len()),
|
||||
)?;
|
||||
}
|
||||
|
||||
@@ -3787,11 +3919,9 @@ impl Timeline {
|
||||
.resident_physical_size_gauge
|
||||
.add(metadata.len());
|
||||
|
||||
new_layer_paths.insert(
|
||||
new_delta_path,
|
||||
LayerFileMetadata::new(metadata.len(), self.generation),
|
||||
);
|
||||
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
|
||||
l.access_stats().record_residence_event(
|
||||
&guard,
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
@@ -3830,13 +3960,7 @@ impl Timeline {
|
||||
|
||||
// Also schedule the deletions in remote storage
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
let deletion_queue = self
|
||||
.deletion_queue_client
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow::anyhow!("Remote storage enabled without deletion queue"))?;
|
||||
remote_client
|
||||
.schedule_layer_file_deletion(&layer_names_to_delete, deletion_queue)
|
||||
.await?;
|
||||
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -4170,15 +4294,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
// Remote metadata upload was scheduled in `update_metadata_file`: wait
|
||||
// for completion before scheduling any deletions.
|
||||
remote_client.wait_completion().await?;
|
||||
let deletion_queue = self.deletion_queue_client.as_ref().ok_or_else(|| {
|
||||
anyhow::anyhow!("Remote storage enabled without deletion queue")
|
||||
})?;
|
||||
remote_client
|
||||
.schedule_layer_file_deletion(&layer_names_to_delete, deletion_queue)
|
||||
.await?;
|
||||
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
|
||||
}
|
||||
|
||||
apply.flush();
|
||||
@@ -4196,7 +4312,7 @@ impl Timeline {
|
||||
///
|
||||
/// Reconstruct a value, using the given base image and WAL records in 'data'.
|
||||
///
|
||||
async fn reconstruct_value(
|
||||
fn reconstruct_value(
|
||||
&self,
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
@@ -4265,7 +4381,6 @@ impl Timeline {
|
||||
last_rec_lsn,
|
||||
&img,
|
||||
)
|
||||
.await
|
||||
.context("Materialized page memoization failed")
|
||||
{
|
||||
return Err(PageReconstructError::from(e));
|
||||
@@ -4725,8 +4840,7 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
|
||||
for i in 0u32.. {
|
||||
new_path.set_file_name(format!("{filename}.{i}.old"));
|
||||
if !new_path.exists() {
|
||||
std::fs::rename(path, &new_path)
|
||||
.with_context(|| format!("rename {path:?} to {new_path:?}"))?;
|
||||
std::fs::rename(path, &new_path)?;
|
||||
return Ok(());
|
||||
}
|
||||
}
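
For illustration: backing up a layer file named X produces X.0.old, or X.1.old if X.0.old already exists, and so on; the loop above picks the first unused suffix.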
@@ -4768,7 +4882,6 @@ mod tests {
|
||||
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::deletion_queue::mock::MockDeletionQueue;
|
||||
use crate::tenant::{harness::TenantHarness, storage_layer::PersistentLayer};
|
||||
|
||||
use super::{EvictionError, Timeline};
|
||||
@@ -4791,17 +4904,9 @@ mod tests {
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config).unwrap()
|
||||
};
|
||||
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()), harness.conf);
|
||||
|
||||
let ctx = any_context();
|
||||
let tenant = harness
|
||||
.try_load(
|
||||
&ctx,
|
||||
Some(remote_storage),
|
||||
Some(deletion_queue.new_client()),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
@@ -4864,17 +4969,9 @@ mod tests {
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config).unwrap()
|
||||
};
|
||||
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()), harness.conf);
|
||||
|
||||
let ctx = any_context();
|
||||
let tenant = harness
|
||||
.try_load(
|
||||
&ctx,
|
||||
Some(remote_storage),
|
||||
Some(deletion_queue.new_client()),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
|
||||
@@ -14,7 +14,6 @@ use utils::{
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
deletion_queue::DeletionQueueClient,
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{
|
||||
metadata::TimelineMetadata,
|
||||
@@ -239,6 +238,15 @@ async fn delete_local_layer_files(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Removes remote layers and an index file after them.
|
||||
async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
|
||||
if let Some(remote_client) = &timeline.remote_client {
|
||||
remote_client.delete_all().await.context("delete_all")?
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// This function removes the remaining traces of a timeline on disk.
// Namely: metadata file, timeline directory, delete mark.
// Note: io::ErrorKind::NotFound errors are ignored for the metadata file and timeline dir.
|
||||
@@ -399,7 +407,6 @@ impl DeleteTimelineFlow {
|
||||
timeline_id: TimelineId,
|
||||
local_metadata: &TimelineMetadata,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
deletion_queue_client: Option<DeletionQueueClient>,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
|
||||
@@ -409,10 +416,7 @@ impl DeleteTimelineFlow {
|
||||
timeline_id,
|
||||
local_metadata,
|
||||
None, // Ancestor is not needed for deletion.
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
deletion_queue_client,
|
||||
},
|
||||
TimelineResources { remote_client },
|
||||
init_order,
|
||||
// Important. We don't pass the ancestor above because it can be missing.
// Thus we need to skip the validation here.
|
||||
@@ -555,7 +559,7 @@ impl DeleteTimelineFlow {
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
|
||||
|
||||
timeline.delete_all_remote().await?;
|
||||
delete_remote_layers_and_index(timeline).await?;
|
||||
|
||||
pausable_failpoint!("in_progress_delete");
|
||||
|
||||
|
||||
@@ -1,213 +0,0 @@
|
||||
use crate::{
|
||||
is_temporary,
|
||||
tenant::{
|
||||
ephemeral_file::is_ephemeral_file,
|
||||
remote_timeline_client::{
|
||||
self,
|
||||
index::{IndexPart, LayerFileMetadata},
|
||||
},
|
||||
storage_layer::LayerFileName,
|
||||
Generation,
|
||||
},
|
||||
METADATA_FILE_NAME,
|
||||
};
|
||||
use anyhow::Context;
|
||||
use std::{collections::HashMap, ffi::OsString, path::Path, str::FromStr};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Identified files in the timeline directory.
|
||||
pub(super) enum Discovered {
|
||||
/// The only one we care about
|
||||
Layer(LayerFileName, u64),
|
||||
/// Old ephemeral files from previous launches; should be removed
|
||||
Ephemeral(OsString),
|
||||
/// Old temporary timeline files, unsure what these really are, should be removed
|
||||
Temporary(OsString),
|
||||
/// Temporary on-demand download files, should be removed
|
||||
TemporaryDownload(OsString),
|
||||
/// "metadata" file we persist locally and include in `index_part.json`
|
||||
Metadata,
|
||||
/// Backup file from previously future layers
|
||||
IgnoredBackup,
|
||||
/// Unrecognized, warn about these
|
||||
Unknown(OsString),
|
||||
}
|
||||
|
||||
/// Scans the timeline directory for interesting files.
|
||||
pub(super) fn scan_timeline_dir(path: &Path) -> anyhow::Result<Vec<Discovered>> {
|
||||
let mut ret = Vec::new();
|
||||
|
||||
for direntry in std::fs::read_dir(path)? {
|
||||
let direntry = direntry?;
|
||||
let direntry_path = direntry.path();
|
||||
let file_name = direntry.file_name();
|
||||
|
||||
let fname = file_name.to_string_lossy();
|
||||
|
||||
let discovered = match LayerFileName::from_str(&fname) {
|
||||
Ok(file_name) => {
|
||||
let file_size = direntry.metadata()?.len();
|
||||
Discovered::Layer(file_name, file_size)
|
||||
}
|
||||
Err(_) => {
|
||||
if fname == METADATA_FILE_NAME {
|
||||
Discovered::Metadata
|
||||
} else if fname.ends_with(".old") {
|
||||
// ignore these
|
||||
Discovered::IgnoredBackup
|
||||
} else if remote_timeline_client::is_temp_download_file(&direntry_path) {
|
||||
Discovered::TemporaryDownload(file_name)
|
||||
} else if is_ephemeral_file(&fname) {
|
||||
Discovered::Ephemeral(file_name)
|
||||
} else if is_temporary(&direntry_path) {
|
||||
Discovered::Temporary(file_name)
|
||||
} else {
|
||||
Discovered::Unknown(file_name)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
ret.push(discovered);
|
||||
}
|
||||
|
||||
Ok(ret)
|
||||
}
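
A hypothetical consumption sketch (placeholder variable names; in this diff the real caller is load_layer_map): keep the layer files and leave everything else for cleanup.

    let discovered = scan_timeline_dir(&timeline_path)?;
    let mut discovered_layers = Vec::new();
    for entry in discovered {
        match entry {
            Discovered::Layer(file_name, file_size) => {
                discovered_layers.push((file_name, file_size));
            }
            // Metadata and .old backups are kept; ephemeral, temporary and
            // temp-download files are removed by the caller.
            _ => {}
        }
    }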
/// Decision on what to do with a layer file after considering its local and remote metadata.
|
||||
#[derive(Clone)]
|
||||
pub(super) enum Decision {
|
||||
/// The layer is not present locally.
|
||||
Evicted(LayerFileMetadata),
|
||||
/// The layer is present locally, but local metadata does not match remote; we must
|
||||
/// delete it and treat it as evicted.
|
||||
UseRemote {
|
||||
local: LayerFileMetadata,
|
||||
remote: LayerFileMetadata,
|
||||
},
|
||||
/// The layer is present locally, and metadata matches.
|
||||
UseLocal(LayerFileMetadata),
|
||||
/// The layer is only known locally, it needs to be uploaded.
|
||||
NeedsUpload(LayerFileMetadata),
|
||||
}
|
||||
|
||||
/// The related layer is in the future compared to disk_consistent_lsn; it must not be loaded.
|
||||
#[derive(Debug)]
|
||||
pub(super) struct FutureLayer {
|
||||
/// The local metadata. `None` if the layer is only known through [`IndexPart`].
|
||||
pub(super) local: Option<LayerFileMetadata>,
|
||||
}
|
||||
|
||||
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
|
||||
///
|
||||
/// This function should not gain additional reasons to fail beyond [`FutureLayer`]; consider adding
/// such checks earlier, to [`scan_timeline_dir`].
|
||||
pub(super) fn reconcile(
|
||||
discovered: Vec<(LayerFileName, u64)>,
|
||||
index_part: Option<&IndexPart>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
generation: Generation,
|
||||
) -> Vec<(LayerFileName, Result<Decision, FutureLayer>)> {
|
||||
use Decision::*;
|
||||
|
||||
// name => (local, remote)
|
||||
type Collected = HashMap<LayerFileName, (Option<LayerFileMetadata>, Option<LayerFileMetadata>)>;
|
||||
|
||||
let mut discovered = discovered
|
||||
.into_iter()
|
||||
.map(|(name, file_size)| {
|
||||
(
|
||||
name,
|
||||
// The generation here will be corrected to match IndexPart in the merge below, unless
|
||||
// it is not in IndexPart, in which case using our current generation makes sense
|
||||
// because it will be uploaded in this generation.
|
||||
(Some(LayerFileMetadata::new(file_size, generation)), None),
|
||||
)
|
||||
})
|
||||
.collect::<Collected>();
|
||||
|
||||
// merge any index_part information, when available
|
||||
index_part
|
||||
.as_ref()
|
||||
.map(|ip| ip.layer_metadata.iter())
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
|
||||
.for_each(|(name, metadata)| {
|
||||
if let Some(existing) = discovered.get_mut(name) {
|
||||
existing.1 = Some(metadata);
|
||||
} else {
|
||||
discovered.insert(name.to_owned(), (None, Some(metadata)));
|
||||
}
|
||||
});
|
||||
|
||||
discovered
|
||||
.into_iter()
|
||||
.map(|(name, (local, remote))| {
|
||||
let decision = if name.is_in_future(disk_consistent_lsn) {
|
||||
Err(FutureLayer { local })
|
||||
} else {
|
||||
Ok(match (local, remote) {
|
||||
(Some(local), Some(remote)) if local != remote => {
|
||||
assert_eq!(local.generation, remote.generation);
|
||||
|
||||
UseRemote { local, remote }
|
||||
}
|
||||
(Some(x), Some(_)) => UseLocal(x),
|
||||
(None, Some(x)) => Evicted(x),
|
||||
(Some(x), None) => NeedsUpload(x),
|
||||
(None, None) => {
|
||||
unreachable!("there must not be any non-local non-remote files")
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
(name, decision)
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
}
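
A rough sketch of how these decisions are consumed (hypothetical driver code mirroring the load_layer_map changes earlier in this diff; the comments describe intent rather than exact calls):

    for (name, decision) in reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn, generation) {
        match decision {
            Ok(Decision::UseLocal(_)) | Ok(Decision::NeedsUpload(_)) => {
                // build a resident DeltaLayer/ImageLayer; NeedsUpload is also queued for upload
            }
            Ok(Decision::Evicted(_)) | Ok(Decision::UseRemote { .. }) => {
                // build a RemoteLayer from the remote metadata (UseRemote first renames the local file)
            }
            Err(FutureLayer { .. }) => {
                // rename any local file to *.old and schedule remote cleanup
            }
        }
    }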
pub(super) fn cleanup(path: &Path, kind: &str) -> anyhow::Result<()> {
|
||||
let file_name = path.file_name().expect("must be file path");
|
||||
tracing::debug!(kind, ?file_name, "cleaning up");
|
||||
std::fs::remove_file(path)
|
||||
.with_context(|| format!("failed to remove {kind} at {}", path.display()))
|
||||
}
|
||||
|
||||
pub(super) fn cleanup_local_file_for_remote(
|
||||
path: &Path,
|
||||
local: &LayerFileMetadata,
|
||||
remote: &LayerFileMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
let local_size = local.file_size();
|
||||
let remote_size = remote.file_size();
|
||||
|
||||
let file_name = path.file_name().expect("must be file path");
|
||||
tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
|
||||
if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
|
||||
assert!(
|
||||
path.exists(),
|
||||
"we would leave the local_layer without a file if this does not hold: {}",
|
||||
path.display()
|
||||
);
|
||||
Err(err)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn cleanup_future_layer(
|
||||
path: &Path,
|
||||
name: &LayerFileName,
|
||||
disk_consistent_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
use LayerFileName::*;
|
||||
let kind = match name {
|
||||
Delta(_) => "delta",
|
||||
Image(_) => "image",
|
||||
};
|
||||
// Future image layers may always be produced for LSNs that are not yet flushed to disk
// but are still stored in InMemoryLayer.
|
||||
tracing::info!("found future {kind} layer {name} disk_consistent_lsn is {disk_consistent_lsn}");
|
||||
crate::tenant::timeline::rename_to_backup(path)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -13,7 +13,7 @@ use crate::{
|
||||
layer_map::{BatchedUpdates, LayerMap},
|
||||
storage_layer::{
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, PersistentLayer,
|
||||
PersistentLayerDesc, PersistentLayerKey,
|
||||
PersistentLayerDesc, PersistentLayerKey, RemoteLayer,
|
||||
},
|
||||
timeline::compare_arced_layers,
|
||||
},
|
||||
@@ -85,6 +85,21 @@ impl LayerManager {
|
||||
self.layer_map.next_open_layer_at = Some(next_open_layer_at);
|
||||
}
|
||||
|
||||
pub(crate) fn initialize_remote_layers(
|
||||
&mut self,
|
||||
corrupted_local_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||
remote_layers: Vec<Arc<RemoteLayer>>,
|
||||
) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for layer in corrupted_local_layers {
|
||||
Self::remove_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
for layer in remote_layers {
|
||||
Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
updates.flush();
|
||||
}
|
||||
|
||||
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
|
||||
/// called within `get_layer_for_write`.
|
||||
pub(crate) fn get_layer_for_write(
|
||||
@@ -250,6 +265,16 @@ impl LayerManager {
|
||||
mapping.insert(layer);
|
||||
}
|
||||
|
||||
/// Helper function to remove a layer from the layer map and file manager
|
||||
fn remove_historic_layer(
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
updates: &mut BatchedUpdates<'_>,
|
||||
mapping: &mut LayerFileManager,
|
||||
) {
|
||||
updates.remove_historic(layer.layer_desc());
|
||||
mapping.remove(layer);
|
||||
}
|
||||
|
||||
/// Removes the layer from local FS (if present) and from memory.
|
||||
/// Remote storage is not affected by this operation.
|
||||
fn delete_historic_layer(
|
||||
|
||||
@@ -17,7 +17,7 @@ use crate::metrics::{
|
||||
WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
|
||||
WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
|
||||
};
|
||||
use crate::task_mgr::{shutdown_token, TaskKind};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
@@ -31,11 +31,10 @@ use storage_broker::Streaming;
|
||||
use tokio::select;
|
||||
use tracing::*;
|
||||
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
use utils::backoff::{
|
||||
exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
use utils::postgres_client::wal_stream_connection_config;
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
@@ -212,14 +211,11 @@ async fn subscribe_for_timeline_updates(
|
||||
id: TenantTimelineId,
|
||||
) -> Streaming<SafekeeperTimelineInfo> {
|
||||
let mut attempt = 0;
|
||||
let cancel = shutdown_token();
|
||||
|
||||
loop {
|
||||
exponential_backoff(
|
||||
attempt,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
&cancel,
|
||||
)
|
||||
.await;
|
||||
attempt += 1;
|
||||
@@ -880,6 +876,33 @@ impl ReconnectReason {
|
||||
}
|
||||
}
|
||||
|
||||
fn wal_stream_connection_config(
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: TenantTimelineId,
|
||||
listen_pg_addr_str: &str,
|
||||
auth_token: Option<&str>,
|
||||
availability_zone: Option<&str>,
|
||||
) -> anyhow::Result<PgConnectionConfig> {
|
||||
let (host, port) =
|
||||
parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
|
||||
let port = port.unwrap_or(5432);
|
||||
let mut connstr = PgConnectionConfig::new_host_port(host, port)
|
||||
.extend_options([
|
||||
"-c".to_owned(),
|
||||
format!("timeline_id={}", timeline_id),
|
||||
format!("tenant_id={}", tenant_id),
|
||||
])
|
||||
.set_password(auth_token.map(|s| s.to_owned()));
|
||||
|
||||
if let Some(availability_zone) = availability_zone {
|
||||
connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
|
||||
}
|
||||
|
||||
Ok(connstr)
|
||||
}
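
For example, a hypothetical call (the address, token, and availability zone below are placeholders, not values from this codebase):

    let config = wal_stream_connection_config(
        ttid,
        "safekeeper-1.example.internal:6500",
        Some("<auth token>"),
        Some("eu-central-1a"),
    )?;

The resulting PgConnectionConfig carries the timeline_id/tenant_id options, the password, and, when given, an availability_zone option.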
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -895,7 +918,6 @@ mod tests {
|
||||
timeline: SafekeeperTimelineInfo {
|
||||
safekeeper_id: 0,
|
||||
tenant_timeline_id: None,
|
||||
term: 0,
|
||||
last_log_term: 0,
|
||||
flush_lsn: 0,
|
||||
commit_lsn,
|
||||
@@ -904,7 +926,6 @@ mod tests {
|
||||
peer_horizon_lsn: 0,
|
||||
local_start_lsn: 0,
|
||||
safekeeper_connstr: safekeeper_connstr.to_owned(),
|
||||
http_connstr: safekeeper_connstr.to_owned(),
|
||||
availability_zone: None,
|
||||
},
|
||||
latest_update,
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use crate::metrics::RemoteOpFileKind;
|
||||
|
||||
use super::storage_layer::LayerFileName;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
@@ -60,6 +62,7 @@ pub(crate) struct UploadQueueInitialized {
|
||||
// Breakdown of different kinds of tasks currently in-progress
|
||||
pub(crate) num_inprogress_layer_uploads: usize,
|
||||
pub(crate) num_inprogress_metadata_uploads: usize,
|
||||
pub(crate) num_inprogress_deletions: usize,
|
||||
|
||||
/// Tasks that are currently in-progress. In-progress means that a tokio Task
|
||||
/// has been launched for it. An in-progress task can be busy uploading, but it can
|
||||
@@ -117,6 +120,7 @@ impl UploadQueue {
|
||||
task_counter: 0,
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::new(),
|
||||
queued_operations: VecDeque::new(),
|
||||
};
|
||||
@@ -144,20 +148,22 @@ impl UploadQueue {
|
||||
);
|
||||
}
|
||||
|
||||
let index_part_metadata = index_part.parse_metadata()?;
|
||||
info!(
|
||||
"initializing upload queue with remote index_part.disk_consistent_lsn: {}",
|
||||
index_part.metadata.disk_consistent_lsn()
|
||||
index_part_metadata.disk_consistent_lsn()
|
||||
);
|
||||
|
||||
let state = UploadQueueInitialized {
|
||||
latest_files: files,
|
||||
latest_files_changes_since_metadata_upload_scheduled: 0,
|
||||
latest_metadata: index_part.metadata.clone(),
|
||||
last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
|
||||
latest_metadata: index_part_metadata.clone(),
|
||||
last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
|
||||
// what follows are boring default initializations
|
||||
task_counter: 0,
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::new(),
|
||||
queued_operations: VecDeque::new(),
|
||||
};
|
||||
@@ -195,6 +201,13 @@ pub(crate) struct UploadTask {
|
||||
pub(crate) op: UploadOp,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Delete {
|
||||
pub(crate) file_kind: RemoteOpFileKind,
|
||||
pub(crate) layer_file_name: LayerFileName,
|
||||
pub(crate) scheduled_from_timeline_delete: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum UploadOp {
|
||||
/// Upload a layer file
|
||||
@@ -203,6 +216,9 @@ pub(crate) enum UploadOp {
|
||||
/// Upload the metadata file
|
||||
UploadMetadata(IndexPart, Lsn),
|
||||
|
||||
/// Delete a layer file
|
||||
Delete(Delete),
|
||||
|
||||
/// Barrier. When the barrier operation is reached,
|
||||
Barrier(tokio::sync::watch::Sender<()>),
|
||||
}
|
||||
@@ -218,9 +234,13 @@ impl std::fmt::Display for UploadOp {
|
||||
metadata.file_size()
|
||||
)
|
||||
}
|
||||
UploadOp::UploadMetadata(_, lsn) => {
|
||||
write!(f, "UploadMetadata(lsn: {})", lsn)
|
||||
}
|
||||
UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
|
||||
UploadOp::Delete(delete) => write!(
|
||||
f,
|
||||
"Delete(path: {}, scheduled_from_timeline_delete: {})",
|
||||
delete.layer_file_name.file_name(),
|
||||
delete.scheduled_from_timeline_delete
|
||||
),
|
||||
UploadOp::Barrier(_) => write!(f, "Barrier"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
-bash: scripts/pytest: No such file or directory
|
||||
@@ -408,9 +408,9 @@ async fn connect_to_compute_once(
|
||||
let (tx, mut rx) = tokio::sync::watch::channel(session);
|
||||
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
let span = info_span!(parent: None, "connection", %conn_id);
|
||||
let span = info_span!(parent: None, "connection", %conn_info, %conn_id);
|
||||
span.in_scope(|| {
|
||||
info!(%conn_info, %session, "new connection");
|
||||
info!(%session, "new connection");
|
||||
});
|
||||
|
||||
tokio::spawn(
|
||||
@@ -420,28 +420,26 @@ async fn connect_to_compute_once(
|
||||
info!(%session, "changed session");
|
||||
}
|
||||
|
||||
loop {
|
||||
let message = ready!(connection.poll_message(cx));
|
||||
let message = ready!(connection.poll_message(cx));
|
||||
|
||||
match message {
|
||||
Some(Ok(AsyncMessage::Notice(notice))) => {
|
||||
info!(%session, "notice: {}", notice);
|
||||
}
|
||||
Some(Ok(AsyncMessage::Notification(notif))) => {
|
||||
warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
|
||||
}
|
||||
Some(Ok(_)) => {
|
||||
warn!(%session, "unknown message");
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
error!(%session, "connection error: {}", e);
|
||||
return Poll::Ready(())
|
||||
}
|
||||
None => {
|
||||
info!("connection closed");
|
||||
return Poll::Ready(())
|
||||
}
|
||||
match message {
|
||||
Some(Ok(AsyncMessage::Notice(notice))) => {
|
||||
info!(%session, "notice: {}", notice);
|
||||
Poll::Pending
|
||||
}
|
||||
Some(Ok(AsyncMessage::Notification(notif))) => {
|
||||
warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
|
||||
Poll::Pending
|
||||
}
|
||||
Some(Ok(_)) => {
|
||||
warn!(%session, "unknown message");
|
||||
Poll::Pending
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
error!(%session, "connection error: {}", e);
|
||||
Poll::Ready(())
|
||||
}
|
||||
None => Poll::Ready(()),
|
||||
}
|
||||
})
|
||||
.instrument(span)
|
||||
|
||||
@@ -341,35 +341,21 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
|
||||
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
|
||||
|
||||
// Load all timelines from disk to memory.
|
||||
GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
|
||||
|
||||
// Keep handles to main tasks to die if any of them disappears.
|
||||
let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
|
||||
FuturesUnordered::new();
|
||||
|
||||
// Start the WAL backup launcher before loading timelines, since we'll notify it
// through the channel about timelines that need offloading; not draining
// the channel would cause a deadlock.
|
||||
let current_thread_rt = conf
|
||||
.current_thread_runtime
|
||||
.then(|| Handle::try_current().expect("no runtime in main"));
|
||||
let conf_ = conf.clone();
|
||||
let wal_backup_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
|
||||
.spawn(wal_backup::wal_backup_launcher_task_main(
|
||||
conf_,
|
||||
wal_backup_launcher_rx,
|
||||
))
|
||||
.map(|res| ("WAL backup launcher".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(wal_backup_handle));
|
||||
|
||||
// Load all timelines from disk to memory.
|
||||
GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?;
|
||||
|
||||
let conf_ = conf.clone();
|
||||
// Run everything in current thread rt, if asked.
|
||||
if conf.current_thread_runtime {
|
||||
info!("running in current thread runtime");
|
||||
}
|
||||
let current_thread_rt = conf
|
||||
.current_thread_runtime
|
||||
.then(|| Handle::try_current().expect("no runtime in main"));
|
||||
|
||||
let wal_service_handle = current_thread_rt
|
||||
.as_ref()
|
||||
@@ -422,6 +408,17 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
.map(|res| ("WAL remover".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(wal_remover_handle));
|
||||
|
||||
let conf_ = conf.clone();
|
||||
let wal_backup_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
|
||||
.spawn(wal_backup::wal_backup_launcher_task_main(
|
||||
conf_,
|
||||
wal_backup_launcher_rx,
|
||||
))
|
||||
.map(|res| ("WAL backup launcher".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(wal_backup_handle));
|
||||
|
||||
set_build_info_metric(GIT_VERSION);
|
||||
|
||||
// TODO: update tokio-stream, convert to real async Stream with
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
//! Code to deal with safekeeper control file upgrades
|
||||
use crate::safekeeper::{
|
||||
AcceptorState, PersistedPeers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermLsn,
|
||||
AcceptorState, PersistedPeers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory,
|
||||
TermSwitchEntry,
|
||||
};
|
||||
use anyhow::{bail, Result};
|
||||
use pq_proto::SystemId;
|
||||
@@ -144,7 +145,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
|
||||
let oldstate = SafeKeeperStateV1::des(&buf[..buf.len()])?;
|
||||
let ac = AcceptorState {
|
||||
term: oldstate.acceptor_state.term,
|
||||
term_history: TermHistory(vec![TermLsn {
|
||||
term_history: TermHistory(vec![TermSwitchEntry {
|
||||
term: oldstate.acceptor_state.epoch,
|
||||
lsn: Lsn(0),
|
||||
}]),
|
||||
|
||||
@@ -19,7 +19,6 @@ use crate::receive_wal::WalReceiverState;
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::safekeeper::Term;
|
||||
use crate::send_wal::WalSenderState;
|
||||
use crate::timeline::PeerInfo;
|
||||
use crate::{debug_dump, pull_timeline};
|
||||
|
||||
use crate::timelines_global_map::TimelineDeleteForceResult;
|
||||
@@ -102,7 +101,6 @@ pub struct TimelineStatus {
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
pub peers: Vec<PeerInfo>,
|
||||
pub walsenders: Vec<WalSenderState>,
|
||||
pub walreceivers: Vec<WalReceiverState>,
|
||||
}
|
||||
@@ -142,7 +140,6 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
term_history,
|
||||
};
|
||||
|
||||
let conf = get_conf(&request);
|
||||
// Note: we report in memory values which can be lost.
|
||||
let status = TimelineStatus {
|
||||
tenant_id: ttid.tenant_id,
|
||||
@@ -156,7 +153,6 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
backup_lsn: inmem.backup_lsn,
|
||||
peer_horizon_lsn: inmem.peer_horizon_lsn,
|
||||
remote_consistent_lsn: tli.get_walsenders().get_remote_consistent_lsn(),
|
||||
peers: tli.get_peers(conf).await,
|
||||
walsenders: tli.get_walsenders().get_all(),
|
||||
walreceivers: tli.get_walreceivers().get_all(),
|
||||
};
|
||||
@@ -286,14 +282,12 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
|
||||
tenant_id: ttid.tenant_id.as_ref().to_owned(),
|
||||
timeline_id: ttid.timeline_id.as_ref().to_owned(),
|
||||
}),
|
||||
term: sk_info.term.unwrap_or(0),
|
||||
last_log_term: sk_info.last_log_term.unwrap_or(0),
|
||||
flush_lsn: sk_info.flush_lsn.0,
|
||||
commit_lsn: sk_info.commit_lsn.0,
|
||||
remote_consistent_lsn: sk_info.remote_consistent_lsn.0,
|
||||
peer_horizon_lsn: sk_info.peer_horizon_lsn.0,
|
||||
safekeeper_connstr: sk_info.safekeeper_connstr.unwrap_or_else(|| "".to_owned()),
|
||||
http_connstr: sk_info.http_connstr.unwrap_or_else(|| "".to_owned()),
|
||||
backup_lsn: sk_info.backup_lsn.0,
|
||||
local_start_lsn: sk_info.local_start_lsn.0,
|
||||
availability_zone: None,
|
||||
|
||||
@@ -21,7 +21,7 @@ use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
|
||||
use crate::safekeeper::{
|
||||
AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected,
|
||||
};
|
||||
use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermLsn};
|
||||
use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry};
|
||||
use crate::timeline::Timeline;
|
||||
use crate::GlobalTimelines;
|
||||
use postgres_backend::PostgresBackend;
|
||||
@@ -119,7 +119,7 @@ async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> any
|
||||
let history = tli.get_state().await.1.acceptor_state.term_history;
|
||||
let history = history.up_to(lsn.checked_sub(1u64).unwrap());
|
||||
let mut history_entries = history.0;
|
||||
history_entries.push(TermLsn { term, lsn });
|
||||
history_entries.push(TermSwitchEntry { term, lsn });
|
||||
let history = TermHistory(history_entries);
|
||||
|
||||
let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected {
|
||||
|
||||
@@ -19,7 +19,6 @@ pub mod json_ctrl;
|
||||
pub mod metrics;
|
||||
pub mod pull_timeline;
|
||||
pub mod receive_wal;
|
||||
pub mod recovery;
|
||||
pub mod remove_wal;
|
||||
pub mod safekeeper;
|
||||
pub mod send_wal;
|
||||
|
||||
@@ -227,9 +227,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
|
||||
tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
|
||||
tokio::fs::rename(tli_dir_path, &timeline_path).await?;
|
||||
|
||||
let tli = GlobalTimelines::load_timeline(ttid)
|
||||
.await
|
||||
.context("Failed to load timeline after copy")?;
|
||||
let tli = GlobalTimelines::load_timeline(ttid).context("Failed to load timeline after copy")?;
|
||||
|
||||
info!(
|
||||
"Loaded timeline {}, flush_lsn={}",
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
//! This module implements pulling WAL from peer safekeepers when the compute can't
//! provide it, i.e. when this safekeeper lags too far behind.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use tokio::{select, time::sleep, time::Duration};
|
||||
use tracing::{info, instrument};
|
||||
|
||||
use crate::{timeline::Timeline, SafeKeeperConf};
|
||||
|
||||
/// Entrypoint for per timeline task which always runs, checking whether
|
||||
/// recovery for this safekeeper is needed and starting it if so.
|
||||
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
|
||||
pub async fn recovery_main(tli: Arc<Timeline>, _conf: SafeKeeperConf) {
|
||||
info!("started");
|
||||
let mut cancellation_rx = match tli.get_cancellation_rx() {
|
||||
Ok(rx) => rx,
|
||||
Err(_) => {
|
||||
info!("timeline canceled during task start");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
select! {
|
||||
_ = recovery_main_loop(tli) => { unreachable!() }
|
||||
_ = cancellation_rx.changed() => {
|
||||
info!("stopped");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const CHECK_INTERVAL_MS: u64 = 2000;
|
||||
|
||||
/// Check regularly whether we need to start recovery.
|
||||
async fn recovery_main_loop(_tli: Arc<Timeline>) {
|
||||
let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
|
||||
loop {
|
||||
sleep(check_duration).await;
|
||||
}
|
||||
}
|
||||
@@ -34,33 +34,22 @@ pub const UNKNOWN_SERVER_VERSION: u32 = 0;

/// Consensus logical timestamp.
pub type Term = u64;
pub const INVALID_TERM: Term = 0;
const INVALID_TERM: Term = 0;

#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct TermLsn {
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct TermSwitchEntry {
pub term: Term,
pub lsn: Lsn,
}

// Creation from tuple provides less typing (e.g. for unit tests).
impl From<(Term, Lsn)> for TermLsn {
fn from(pair: (Term, Lsn)) -> TermLsn {
TermLsn {
term: pair.0,
lsn: pair.1,
}
}
}

#[derive(Clone, Serialize, Deserialize)]
pub struct TermHistory(pub Vec<TermLsn>);
pub struct TermHistory(pub Vec<TermSwitchEntry>);

impl TermHistory {
pub fn empty() -> TermHistory {
TermHistory(Vec::new())
}

// Parse TermHistory as n_entries followed by TermLsn pairs
// Parse TermHistory as n_entries followed by TermSwitchEntry pairs
pub fn from_bytes(bytes: &mut Bytes) -> Result<TermHistory> {
if bytes.remaining() < 4 {
bail!("TermHistory misses len");
@@ -71,7 +60,7 @@ impl TermHistory {
if bytes.remaining() < 16 {
bail!("TermHistory is incomplete");
}
res.push(TermLsn {
res.push(TermSwitchEntry {
term: bytes.get_u64_le(),
lsn: bytes.get_u64_le().into(),
})
@@ -568,17 +557,12 @@ where
.up_to(self.flush_lsn())
}

/// Get current term.
pub fn get_term(&self) -> Term {
self.state.acceptor_state.term
}

pub fn get_epoch(&self) -> Term {
self.state.acceptor_state.get_epoch(self.flush_lsn())
}

/// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet.
pub fn flush_lsn(&self) -> Lsn {
fn flush_lsn(&self) -> Lsn {
max(self.wal_store.flush_lsn(), self.state.timeline_start_lsn)
}

@@ -1154,7 +1138,7 @@ mod tests {
let pem = ProposerElected {
term: 1,
start_streaming_at: Lsn(1),
term_history: TermHistory(vec![TermLsn {
term_history: TermHistory(vec![TermSwitchEntry {
term: 1,
lsn: Lsn(3),
}]),

@@ -2,12 +2,12 @@
//! with the "START_REPLICATION" message, and registry of walsenders.

use crate::handler::SafekeeperPostgresHandler;
use crate::safekeeper::{Term, TermLsn};
use crate::safekeeper::Term;
use crate::timeline::Timeline;
use crate::wal_service::ConnectionId;
use crate::wal_storage::WalReader;
use crate::GlobalTimelines;
use anyhow::{bail, Context as AnyhowContext};
use anyhow::Context as AnyhowContext;
use bytes::Bytes;
use parking_lot::Mutex;
use postgres_backend::PostgresBackend;
@@ -390,25 +390,26 @@ impl SafekeeperPostgresHandler {
self.appname.clone(),
));

// Walsender can operate in one of two modes which we select by
// application_name: give only committed WAL (used by pageserver) or all
// existing WAL (up to flush_lsn, used by walproposer or peer recovery).
// The second case is always driven by a consensus leader which term
// must generally be also supplied. However we're sloppy to do this in
// walproposer recovery which will be removed soon. So TODO is to make
// it not Option'al then.
let commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx();

// Walproposer gets special handling: safekeeper must give proposer all
// local WAL till the end, whether committed or not (walproposer will
// hang otherwise). That's because walproposer runs the consensus and
// synchronizes safekeepers on the most advanced one.
//
// Fetching WAL without term in recovery creates a small risk of this
// WAL getting concurrently garbaged if another compute rises which
// collects majority and starts fixing log on this safekeeper itself.
// That's ok as (old) proposer will never be able to commit such WAL.
let end_watch = if self.is_walproposer_recovery() {
EndWatch::Flush(tli.get_term_flush_lsn_watch_rx())
// There is a small risk of this WAL getting concurrently garbaged if
// another compute rises which collects majority and starts fixing log
// on this safekeeper itself. That's ok as (old) proposer will never be
// able to commit such WAL.
let stop_pos: Option<Lsn> = if self.is_walproposer_recovery() {
let wal_end = tli.get_flush_lsn().await;
Some(wal_end)
} else {
EndWatch::Commit(tli.get_commit_lsn_watch_rx())
None
};
// we don't check term here; it will be checked on first waiting/WAL reading anyway.
let end_pos = end_watch.get();

// take the latest commit_lsn if don't have stop_pos
let end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());

if end_pos < start_pos {
warn!(
@@ -418,10 +419,8 @@ impl SafekeeperPostgresHandler {
}

info!(
"starting streaming from {:?}, available WAL ends at {}, recovery={}",
start_pos,
end_pos,
matches!(end_watch, EndWatch::Flush(_))
"starting streaming from {:?} till {:?}, available WAL ends at {}",
start_pos, stop_pos, end_pos
);

// switch to copy
@@ -446,8 +445,9 @@ impl SafekeeperPostgresHandler {
appname,
start_pos,
end_pos,
stop_pos,
term,
end_watch,
commit_lsn_watch_rx,
ws_guard: ws_guard.clone(),
wal_reader,
send_buf: [0; MAX_SEND_SIZE],
@@ -466,32 +466,6 @@ impl SafekeeperPostgresHandler {
}
}

/// Walsender streams either up to commit_lsn (normally) or flush_lsn in the
/// given term (recovery by walproposer or peer safekeeper).
enum EndWatch {
Commit(Receiver<Lsn>),
Flush(Receiver<TermLsn>),
}

impl EndWatch {
/// Get current end of WAL.
fn get(&self) -> Lsn {
match self {
EndWatch::Commit(r) => *r.borrow(),
EndWatch::Flush(r) => r.borrow().lsn,
}
}

/// Wait for the update.
async fn changed(&mut self) -> anyhow::Result<()> {
match self {
EndWatch::Commit(r) => r.changed().await?,
EndWatch::Flush(r) => r.changed().await?,
}
Ok(())
}
}

/// A half driving sending WAL.
struct WalSender<'a, IO> {
pgb: &'a mut PostgresBackend<IO>,
@@ -506,12 +480,14 @@ struct WalSender<'a, IO> {
// We send this LSN to the receiver as wal_end, so that it knows how much
// WAL this safekeeper has. This LSN should be as fresh as possible.
end_pos: Lsn,
// If present, terminate after reaching this position; used by walproposer
// in recovery.
stop_pos: Option<Lsn>,
/// When streaming uncommitted part, the term the client acts as the leader
/// in. Streaming is stopped if local term changes to a different (higher)
/// value.
term: Option<Term>,
/// Watch channel receiver to learn end of available WAL (and wait for its advancement).
end_watch: EndWatch,
commit_lsn_watch_rx: Receiver<Lsn>,
ws_guard: Arc<WalSenderGuard>,
wal_reader: WalReader,
// buffer for reading WAL into to send it
@@ -521,20 +497,29 @@ struct WalSender<'a, IO> {
impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
/// Send WAL until
/// - an error occurs
/// - receiver is caughtup and there is no computes (if streaming up to commit_lsn)
/// - if we are streaming to walproposer, we've streamed until stop_pos
/// (recovery finished)
/// - receiver is caughtup and there is no computes
///
/// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
/// convenience.
async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> {
loop {
// Wait for the next portion if it is not there yet, or just
// update our end of WAL available for sending value, we
// communicate it to the receiver.
self.wait_wal().await?;
assert!(
self.end_pos > self.start_pos,
"nothing to send after waiting for WAL"
);
// If we are streaming to walproposer, check it is time to stop.
if let Some(stop_pos) = self.stop_pos {
if self.start_pos >= stop_pos {
// recovery finished
return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
"ending streaming to walproposer at {}, recovery finished",
self.start_pos
)));
}
} else {
// Wait for the next portion if it is not there yet, or just
// update our end of WAL available for sending value, we
// communicate it to the receiver.
self.wait_wal().await?;
}

// try to send as much as available, capped by MAX_SEND_SIZE
let mut send_size = self
@@ -582,7 +567,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
/// exit in the meanwhile
async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> {
loop {
self.end_pos = self.end_watch.get();
self.end_pos = *self.commit_lsn_watch_rx.borrow();
if self.end_pos > self.start_pos {
// We have something to send.
trace!("got end_pos {:?}, streaming", self.end_pos);
@@ -590,31 +575,27 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
}

// Wait for WAL to appear, now self.end_pos == self.start_pos.
if let Some(lsn) = wait_for_lsn(&mut self.end_watch, self.term, self.start_pos).await? {
if let Some(lsn) = wait_for_lsn(&mut self.commit_lsn_watch_rx, self.start_pos).await? {
self.end_pos = lsn;
trace!("got end_pos {:?}, streaming", self.end_pos);
return Ok(());
}

// Timed out waiting for WAL, check for termination and send KA.
// Check for termination only if we are streaming up to commit_lsn
// (to pageserver).
if let EndWatch::Commit(_) = self.end_watch {
if let Some(remote_consistent_lsn) = self
.ws_guard
.walsenders
.get_ws_remote_consistent_lsn(self.ws_guard.id)
{
if self.tli.should_walsender_stop(remote_consistent_lsn).await {
// Terminate if there is nothing more to send.
// Note that "ending streaming" part of the string is used by
// pageserver to identify WalReceiverError::SuccessfulCompletion,
// do not change this string without updating pageserver.
return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
// Timed out waiting for WAL, check for termination and send KA
if let Some(remote_consistent_lsn) = self
.ws_guard
.walsenders
.get_ws_remote_consistent_lsn(self.ws_guard.id)
{
if self.tli.should_walsender_stop(remote_consistent_lsn).await {
// Terminate if there is nothing more to send.
// Note that "ending streaming" part of the string is used by
// pageserver to identify WalReceiverError::SuccessfulCompletion,
// do not change this string without updating pageserver.
return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
"ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
self.appname, self.start_pos,
)));
}
}
}

@@ -682,32 +663,22 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {

const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);

/// Wait until we have available WAL > start_pos or timeout expires. Returns
/// - Ok(Some(end_pos)) if needed lsn is successfully observed;
/// Wait until we have commit_lsn > lsn or timeout expires. Returns
/// - Ok(Some(commit_lsn)) if needed lsn is successfully observed;
/// - Ok(None) if timeout expired;
/// - Err in case of error -- only if 1) term changed while fetching in recovery
/// mode 2) watch channel closed, which must never happen.
async fn wait_for_lsn(
rx: &mut EndWatch,
client_term: Option<Term>,
start_pos: Lsn,
) -> anyhow::Result<Option<Lsn>> {
/// - Err in case of error (if watch channel is in trouble, shouldn't happen).
async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> anyhow::Result<Option<Lsn>> {
let res = timeout(POLL_STATE_TIMEOUT, async move {
let mut commit_lsn;
loop {
let end_pos = rx.get();
if end_pos > start_pos {
return Ok(end_pos);
}
if let EndWatch::Flush(rx) = rx {
let curr_term = rx.borrow().term;
if let Some(client_term) = client_term {
if curr_term != client_term {
bail!("term changed: requested {}, now {}", client_term, curr_term);
}
}
}
rx.changed().await?;
commit_lsn = *rx.borrow();
if commit_lsn > lsn {
break;
}
}

Ok(commit_lsn)
})
.await;

@@ -3,11 +3,8 @@

use anyhow::{anyhow, bail, Result};
use postgres_ffi::XLogSegNo;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use tokio::fs;

use serde_with::DisplayFromStr;
use std::cmp::max;
use std::path::PathBuf;
use std::sync::Arc;
@@ -27,10 +24,9 @@ use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;

use crate::receive_wal::WalReceivers;
use crate::recovery::recovery_main;
use crate::safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
SafekeeperMemState, ServerInfo, Term, TermLsn, INVALID_TERM,
SafekeeperMemState, ServerInfo, Term,
};
use crate::send_wal::WalSenders;
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
@@ -41,25 +37,18 @@ use crate::SafeKeeperConf;
use crate::{debug_dump, wal_storage};

/// Things safekeeper should know about timeline state on peers.
#[serde_as]
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone)]
pub struct PeerInfo {
pub sk_id: NodeId,
/// Term of the last entry.
_last_log_term: Term,
/// LSN of the last record.
#[serde_as(as = "DisplayFromStr")]
_flush_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub commit_lsn: Lsn,
/// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
/// sk since backup_lsn.
#[serde_as(as = "DisplayFromStr")]
pub local_start_lsn: Lsn,
/// When info was received. Serde annotations are not very useful but make
/// the code compile -- we don't rely on this field externally.
#[serde(skip)]
#[serde(default = "Instant::now")]
/// When info was received.
ts: Instant,
}

@@ -248,9 +237,8 @@ impl SharedState {
tenant_id: ttid.tenant_id.as_ref().to_owned(),
timeline_id: ttid.timeline_id.as_ref().to_owned(),
}),
term: self.sk.state.acceptor_state.term,
last_log_term: self.sk.get_epoch(),
flush_lsn: self.sk.flush_lsn().0,
flush_lsn: self.sk.wal_store.flush_lsn().0,
// note: this value is not flushed to control file yet and can be lost
commit_lsn: self.sk.inmem.commit_lsn.0,
remote_consistent_lsn: remote_consistent_lsn.0,
@@ -259,7 +247,6 @@ impl SharedState {
.advertise_pg_addr
.to_owned()
.unwrap_or(conf.listen_pg_addr.clone()),
http_connstr: conf.listen_http_addr.to_owned(),
backup_lsn: self.sk.inmem.backup_lsn.0,
local_start_lsn: self.sk.state.local_start_lsn.0,
availability_zone: conf.availability_zone.clone(),
@@ -309,13 +296,6 @@ pub struct Timeline {
commit_lsn_watch_tx: watch::Sender<Lsn>,
commit_lsn_watch_rx: watch::Receiver<Lsn>,

/// Broadcasts (current term, flush_lsn) updates, walsender is interested in
/// them when sending in recovery mode (to walproposer or peers). Note: this
/// is just a notification, WAL reading should always be done with lock held as
/// term can change otherwise.
term_flush_lsn_watch_tx: watch::Sender<TermLsn>,
term_flush_lsn_watch_rx: watch::Receiver<TermLsn>,

/// Safekeeper and other state, that should remain consistent and
/// synchronized with the disk. This is tokio mutex as we write WAL to disk
/// while holding it, ensuring that consensus checks are in order.
@@ -337,20 +317,16 @@ pub struct Timeline {
impl Timeline {
/// Load existing timeline from disk.
pub fn load_timeline(
conf: &SafeKeeperConf,
conf: SafeKeeperConf,
ttid: TenantTimelineId,
wal_backup_launcher_tx: Sender<TenantTimelineId>,
) -> Result<Timeline> {
let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();

let shared_state = SharedState::restore(conf, &ttid)?;
let shared_state = SharedState::restore(&conf, &ttid)?;
let rcl = shared_state.sk.state.remote_consistent_lsn;
let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
watch::channel(shared_state.sk.state.commit_lsn);
let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from((
shared_state.sk.get_term(),
shared_state.sk.flush_lsn(),
)));
let (cancellation_tx, cancellation_rx) = watch::channel(false);

Ok(Timeline {
@@ -358,8 +334,6 @@ impl Timeline {
wal_backup_launcher_tx,
commit_lsn_watch_tx,
commit_lsn_watch_rx,
term_flush_lsn_watch_tx,
term_flush_lsn_watch_rx,
mutex: Mutex::new(shared_state),
walsenders: WalSenders::new(rcl),
walreceivers: WalReceivers::new(),
@@ -371,7 +345,7 @@ impl Timeline {

/// Create a new timeline, which is not yet persisted to disk.
pub fn create_empty(
conf: &SafeKeeperConf,
conf: SafeKeeperConf,
ttid: TenantTimelineId,
wal_backup_launcher_tx: Sender<TenantTimelineId>,
server_info: ServerInfo,
@@ -379,8 +353,6 @@ impl Timeline {
local_start_lsn: Lsn,
) -> Result<Timeline> {
let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID);
let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) =
watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID)));
let (cancellation_tx, cancellation_rx) = watch::channel(false);
let state = SafeKeeperState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);

@@ -389,9 +361,7 @@ impl Timeline {
wal_backup_launcher_tx,
commit_lsn_watch_tx,
commit_lsn_watch_rx,
term_flush_lsn_watch_tx,
term_flush_lsn_watch_rx,
mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?),
mutex: Mutex::new(SharedState::create_new(&conf, &ttid, state)?),
walsenders: WalSenders::new(Lsn(0)),
walreceivers: WalReceivers::new(),
cancellation_rx,
@@ -400,16 +370,12 @@ impl Timeline {
})
}

/// Initialize fresh timeline on disk and start background tasks. If init
/// Initialize fresh timeline on disk and start background tasks. If bootstrap
/// fails, timeline is cancelled and cannot be used anymore.
///
/// Init is transactional, so if it fails, created files will be deleted,
/// Bootstrap is transactional, so if it fails, created files will be deleted,
/// and state on disk should remain unchanged.
pub async fn init_new(
self: &Arc<Timeline>,
shared_state: &mut MutexGuard<'_, SharedState>,
conf: &SafeKeeperConf,
) -> Result<()> {
pub async fn bootstrap(&self, shared_state: &mut MutexGuard<'_, SharedState>) -> Result<()> {
match fs::metadata(&self.timeline_dir).await {
Ok(_) => {
// Timeline directory exists on disk, we should leave state unchanged
@@ -425,7 +391,7 @@ impl Timeline {
// Create timeline directory.
fs::create_dir_all(&self.timeline_dir).await?;

// Write timeline to disk and start background tasks.
// Write timeline to disk and TODO: start background tasks.
if let Err(e) = shared_state.sk.persist().await {
// Bootstrap failed, cancel timeline and remove timeline directory.
self.cancel(shared_state);
@@ -439,14 +405,10 @@ impl Timeline {

return Err(e);
}
self.bootstrap(conf);
Ok(())
}

/// Bootstrap new or existing timeline starting background tasks.
pub fn bootstrap(self: &Arc<Timeline>, conf: &SafeKeeperConf) {
// Start recovery task which always runs on the timeline.
tokio::spawn(recovery_main(self.clone(), conf.clone()));
// TODO: add more initialization steps here
self.update_status(shared_state);
Ok(())
}

/// Delete timeline from disk completely, by removing timeline directory. Background
@@ -482,16 +444,6 @@ impl Timeline {
*self.cancellation_rx.borrow()
}

/// Returns watch channel which gets value when timeline is cancelled. It is
/// guaranteed to have not cancelled value observed (errors otherwise).
pub fn get_cancellation_rx(&self) -> Result<watch::Receiver<bool>> {
let rx = self.cancellation_rx.clone();
if *rx.borrow() {
bail!(TimelineError::Cancelled(self.ttid));
}
Ok(rx)
}

/// Take a writing mutual exclusive lock on timeline shared_state.
pub async fn write_shared_state(&self) -> MutexGuard<SharedState> {
self.mutex.lock().await
@@ -568,11 +520,6 @@ impl Timeline {
self.commit_lsn_watch_rx.clone()
}

/// Returns term_flush_lsn watch channel.
pub fn get_term_flush_lsn_watch_rx(&self) -> watch::Receiver<TermLsn> {
self.term_flush_lsn_watch_rx.clone()
}

/// Pass arrived message to the safekeeper.
pub async fn process_msg(
&self,
@@ -584,7 +531,6 @@ impl Timeline {

let mut rmsg: Option<AcceptorProposerMessage>;
let commit_lsn: Lsn;
let term_flush_lsn: TermLsn;
{
let mut shared_state = self.write_shared_state().await;
rmsg = shared_state.sk.process_msg(msg).await?;
@@ -598,11 +544,8 @@ impl Timeline {
}

commit_lsn = shared_state.sk.inmem.commit_lsn;
term_flush_lsn =
TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
}
self.commit_lsn_watch_tx.send(commit_lsn)?;
self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
Ok(rmsg)
}

@@ -11,7 +11,7 @@ use serde::Serialize;
use std::collections::HashMap;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::{Arc, Mutex};
use std::sync::{Arc, Mutex, MutexGuard};
use tokio::sync::mpsc::Sender;
use tracing::*;
use utils::id::{TenantId, TenantTimelineId, TimelineId};
@@ -71,23 +71,19 @@ pub struct GlobalTimelines;

impl GlobalTimelines {
/// Inject dependencies needed for the timeline constructors and load all timelines to memory.
pub async fn init(
pub fn init(
conf: SafeKeeperConf,
wal_backup_launcher_tx: Sender<TenantTimelineId>,
) -> Result<()> {
// clippy isn't smart enough to understand that drop(state) releases the
// lock, so use explicit block
let tenants_dir = {
let mut state = TIMELINES_STATE.lock().unwrap();
assert!(state.wal_backup_launcher_tx.is_none());
state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
state.conf = Some(conf);
let mut state = TIMELINES_STATE.lock().unwrap();
assert!(state.wal_backup_launcher_tx.is_none());
state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
state.conf = Some(conf);

// Iterate through all directories and load tenants for all directories
// named as a valid tenant_id.
state.get_conf().workdir.clone()
};
// Iterate through all directories and load tenants for all directories
// named as a valid tenant_id.
let mut tenant_count = 0;
let tenants_dir = state.get_conf().workdir.clone();
for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
.with_context(|| format!("failed to list tenants dir {}", tenants_dir.display()))?
{
@@ -97,7 +93,7 @@ impl GlobalTimelines {
TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or(""))
{
tenant_count += 1;
GlobalTimelines::load_tenant_timelines(tenant_id).await?;
GlobalTimelines::load_tenant_timelines(&mut state, tenant_id)?;
}
}
Err(e) => error!(
@@ -112,7 +108,7 @@ impl GlobalTimelines {
info!(
"found {} tenants directories, successfully loaded {} timelines",
tenant_count,
TIMELINES_STATE.lock().unwrap().timelines.len()
state.timelines.len()
);
Ok(())
}
@@ -120,21 +116,17 @@ impl GlobalTimelines {
/// Loads all timelines for the given tenant to memory. Returns fs::read_dir
/// errors if any.
///
/// It is async for update_status_notify sake. Since TIMELINES_STATE lock is
/// sync and there is no important reason to make it async (it is always
/// held for a short while) we just lock and unlock it for each timeline --
/// this function is called during init when nothing else is running, so
/// this is fine.
async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
let (conf, wal_backup_launcher_tx) = {
let state = TIMELINES_STATE.lock().unwrap();
(
state.get_conf().clone(),
state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
)
};

let timelines_dir = conf.tenant_dir(&tenant_id);
/// Note: This function (and all reading/loading below) is sync because
/// timelines are loaded while holding GlobalTimelinesState lock. Which is
/// fine as this is called only from single threaded main runtime on boot,
/// but clippy complains anyway, and suppressing that isn't trivial as async
/// is the keyword, ha. That only other user is pull_timeline.rs for which
/// being blocked is not that bad, and we can do spawn_blocking.
fn load_tenant_timelines(
state: &mut MutexGuard<'_, GlobalTimelinesState>,
tenant_id: TenantId,
) -> Result<()> {
let timelines_dir = state.get_conf().tenant_dir(&tenant_id);
for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
.with_context(|| format!("failed to list timelines dir {}", timelines_dir.display()))?
{
@@ -144,16 +136,13 @@ impl GlobalTimelines {
TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
{
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx.clone()) {
match Timeline::load_timeline(
state.get_conf().clone(),
ttid,
state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
) {
Ok(timeline) => {
let tli = Arc::new(timeline);
TIMELINES_STATE
.lock()
.unwrap()
.timelines
.insert(ttid, tli.clone());
tli.bootstrap(&conf);
tli.update_status_notify().await.unwrap();
state.timelines.insert(ttid, Arc::new(timeline));
}
// If we can't load a timeline, it's most likely because of a corrupted
// directory. We will log an error and won't allow to delete/recreate
@@ -179,22 +168,18 @@ impl GlobalTimelines {
}

/// Load timeline from disk to the memory.
pub async fn load_timeline(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
pub fn load_timeline(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();

match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) {
match Timeline::load_timeline(conf, ttid, wal_backup_launcher_tx) {
Ok(timeline) => {
let tli = Arc::new(timeline);

// TODO: prevent concurrent timeline creation/loading
TIMELINES_STATE
.lock()
.unwrap()
.timelines
.insert(ttid, tli.clone());

tli.bootstrap(&conf);

Ok(tli)
}
// If we can't load a timeline, it's bad. Caller will figure it out.
@@ -232,7 +217,7 @@ impl GlobalTimelines {
info!("creating new timeline {}", ttid);

let timeline = Arc::new(Timeline::create_empty(
&conf,
conf,
ttid,
wal_backup_launcher_tx,
server_info,
@@ -255,24 +240,23 @@ impl GlobalTimelines {
// Write the new timeline to the disk and start background workers.
// Bootstrap is transactional, so if it fails, the timeline will be deleted,
// and the state on disk should remain unchanged.
if let Err(e) = timeline.init_new(&mut shared_state, &conf).await {
// Note: the most likely reason for init failure is that the timeline
if let Err(e) = timeline.bootstrap(&mut shared_state).await {
// Note: the most likely reason for bootstrap failure is that the timeline
// directory already exists on disk. This happens when timeline is corrupted
// and wasn't loaded from disk on startup because of that. We want to preserve
// the timeline directory in this case, for further inspection.

// TODO: this is an unusual error, perhaps we should send it to sentry
// TODO: compute will try to create timeline every second, we should add backoff
error!("failed to init new timeline {}: {}", ttid, e);
error!("failed to bootstrap timeline {}: {}", ttid, e);

// Timeline failed to init, it cannot be used. Remove it from the map.
// Timeline failed to bootstrap, it cannot be used. Remove it from the map.
TIMELINES_STATE.lock().unwrap().timelines.remove(&ttid);
return Err(e);
}
// We are done with bootstrap, release the lock, return the timeline.
// {} block forces release before .await
}
timeline.update_status_notify().await?;
timeline.wal_backup_launcher_tx.send(timeline.ttid).await?;
Ok(timeline)
}

76
scripts/combine_control_files.py
Normal file
@@ -0,0 +1,76 @@
#! /usr/bin/env python3
# Script to generate ext_index.json metadata file
# that stores content of the control files and location of extension archives
# for all extensions in extensions subdir.
import argparse
import json
import subprocess
from pathlib import Path

"""
# ext_index.json example:
{
"public_extensions": [
"anon"
],
"library_index": {
"anon": "anon",
// for more complex extensions like postgis
// we might have something like:
// address_standardizer: postgis
// postgis_tiger: postgis
},
"extension_data": {
"anon": {
"control_data": {
"anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
},
"archive_path": "5648391853/v15/extensions/anon.tar.zst"
}
}
}
"""

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="generate ext_index.json")
parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version")
parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image")
parser.add_argument("--public_extensions", type=str, help="list of public extensions")
args = parser.parse_args()
pg_version = args.pg_version
BUILD_TAG = args.BUILD_TAG
public_ext_list = args.public_extensions.split(",")

ext_index = {}
library_index = {}
EXT_PATH = Path("extensions")
for extension in EXT_PATH.iterdir():
if extension.is_dir():
control_data = {}
for control_file in extension.glob("*.control"):
if control_file.suffix != ".control":
continue
with open(control_file, "r") as f:
control_data[control_file.name] = f.read()
ext_index[extension.name] = {
"control_data": control_data,
"archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst",
}
elif extension.suffix == ".zst":
file_list = (
str(subprocess.check_output(["tar", "tf", str(extension)]), "utf-8")
.strip()
.split("\n")
)
for file in file_list:
if file.endswith(".so") and file.startswith("lib/"):
lib_name = file[4:-3]
library_index[lib_name] = extension.name.replace(".tar.zst", "")

all_data = {
"public_extensions": public_ext_list,
"library_index": library_index,
"extension_data": ext_index,
}
with open("ext_index.json", "w") as f:
json.dump(all_data, f)
@@ -12,26 +12,25 @@ import psycopg2.extras
# We call the test "flaky" if it failed at least once on the main branch in the last N=10 days.
FLAKY_TESTS_QUERY = """
SELECT
DISTINCT parent_suite, suite, REGEXP_REPLACE(test, '(release|debug)-pg(\\d+)-?', '') as deparametrized_test
DISTINCT parent_suite, suite, test
FROM
(
SELECT
reference,
jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'retriesStatusChange' as retries_status_change,
to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' ->> 'start')::bigint / 1000)::date as timestamp
revision,
jsonb_array_elements(data -> 'children') -> 'name' as parent_suite,
jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'retriesStatusChange' as retries_status_change,
to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp
FROM
regress_test_results
WHERE
reference = 'refs/heads/main'
) data
WHERE
timestamp > CURRENT_DATE - INTERVAL '%s' day
AND (
(status IN ('failed', 'broken') AND reference = 'refs/heads/main')
OR retries_status_change::boolean
)
AND (status::text IN ('"failed"', '"broken"') OR retries_status_change::boolean)
;
"""

@@ -41,9 +40,6 @@ def main(args: argparse.Namespace):
interval_days = args.days
output = args.output

build_type = args.build_type
pg_version = args.pg_version

res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]]
res = defaultdict(lambda: defaultdict(dict))

@@ -59,21 +55,8 @@ def main(args: argparse.Namespace):
rows = []

for row in rows:
# We don't want to automatically rerun tests in a performance suite
if row["parent_suite"] != "test_runner.regress":
continue

deparametrized_test = row["deparametrized_test"]
dash_if_needed = "" if deparametrized_test.endswith("[]") else "-"
parametrized_test = deparametrized_test.replace(
"[",
f"[{build_type}-pg{pg_version}{dash_if_needed}",
)
res[row["parent_suite"]][row["suite"]][parametrized_test] = True

logging.info(
f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{parametrized_test}"
)
logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}")
res[row["parent_suite"]][row["suite"]][row["test"]] = True

logging.info(f"saving results to {output.name}")
json.dump(res, output, indent=2)
@@ -94,18 +77,6 @@ if __name__ == "__main__":
type=int,
help="how many days to look back for flaky tests (default: 10)",
)
parser.add_argument(
"--build-type",
required=True,
type=str,
help="for which build type to create list of flaky tests (debug or release)",
)
parser.add_argument(
"--pg-version",
required=True,
type=int,
help="for which Postgres version to create list of flaky tests (14, 15, etc.)",
)
parser.add_argument(
"connstr",
help="connection string to the test results database",

@@ -125,7 +125,6 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
tenant_id: vec![0xFF; 16],
timeline_id: tli_from_u64(counter % n_keys),
}),
term: 0,
last_log_term: 0,
flush_lsn: counter,
commit_lsn: 2,
@@ -133,7 +132,6 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
remote_consistent_lsn: 4,
peer_horizon_lsn: 5,
safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(),
http_connstr: "zenith-1-sk-1.local:7677".to_owned(),
local_start_lsn: 0,
availability_zone: None,
};

@@ -22,8 +22,6 @@ message SubscribeSafekeeperInfoRequest {
message SafekeeperTimelineInfo {
uint64 safekeeper_id = 1;
TenantTimelineId tenant_timeline_id = 2;
// Safekeeper term
uint64 term = 12;
// Term of the last entry.
uint64 last_log_term = 3;
// LSN of the last record.
@@ -38,8 +36,6 @@ message SafekeeperTimelineInfo {
uint64 local_start_lsn = 9;
// A connection string to use for WAL receiving.
string safekeeper_connstr = 10;
// HTTP endpoint connection string
string http_connstr = 13;
// Availability zone of a safekeeper.
optional string availability_zone = 11;
}

@@ -519,7 +519,6 @@ mod tests {
tenant_id: vec![0x00; 16],
timeline_id,
}),
term: 0,
last_log_term: 0,
flush_lsn: 1,
commit_lsn: 2,
@@ -527,7 +526,6 @@ mod tests {
remote_consistent_lsn: 4,
peer_horizon_lsn: 5,
safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(),
http_connstr: "neon-1-sk-1.local:7677".to_owned(),
local_start_lsn: 0,
availability_zone: None,
}

Some files were not shown because too many files have changed in this diff