Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-13 23:50:36 +00:00)
Compare commits: partitioni... → relsize_ca... (103 commits)
Commit SHA1s:
12bac9c12b, 9a7427c203, 968c20ca5f, f8a64512df, 07acd6ddde, 2b21d7b5bc, 61cc562822, 7c041d9939, 7f048abf3b, 5cf94a5848,
5cf597044d, 95452e605a, 21da9199fa, 39d86ed29e, f540f115a3, 0b5b2e8e0b, 60e5dc10e6, 1f5918b36d, 80b7a3b51a, 85bda437de,
52f445094a, bcdee3d3b5, c08fa9d562, 00c26ff3a3, ec0faf3ac6, 1a5af6d7a5, 520ffb341b, 9f2b40645d, 168214e0b6, d9d4ef12c3,
e1e24336b7, 4c54e4b37d, ae116ff0a9, e6ea049165, 747d009bb4, cb5df3c627, 0e3456351f, 1faf49da0f, 4a96259bdd, 242af75653,
8fabdc6708, 07df7c2edd, 50821c0a3c, 68adfe0fc8, cfdf79aceb, 32560e75d2, bb69e0920c, 05f6a1394d, 844832ffe4, d29c545b5d,
6abdb12724, 7898e72990, 65704708fa, 6100a02d0f, 97fed38213, cadaca010c, f09c09438a, 00fc696606, 1d0706cf25, 5ee19b0758,
cef90d9220, 4a05413a4c, dd61f3558f, 8a714f1ebf, 137291dc24, eb8926083e, 26bca6ddba, 55192384c3, 392cd8b1fc, 3cc531d093,
84b9fcbbd5, 93e050afe3, 6d7dc384a5, 3c2b03cd87, 7c49abe7d1, d059e588a6, 6222a0012b, 1ca28e6f3c, 6c4d6a2183, 37465dafe3,
ec0064c442, 83c7e6ce52, f862373ac0, 699f46cd84, 36ee182d26, d11c9f9fcb, d8a37452c8, e1336f451d, a4d8261390, e2a5a31595,
0ac0fba77a, a001052cdd, 1f1d852204, f7b878611a, a51b2dac9a, e22d9cee3a, a01999bc4a, 32e64afd54, 8a53472e4f, 6e26588d17,
0b93253b3c, 7dc6beacbd, 6cfebc096f
@@ -1,18 +0,0 @@
-[Unit]
-Description=Zenith safekeeper
-After=network.target auditd.service
-
-[Service]
-Type=simple
-User=safekeeper
-Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
-ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="wal"}'
-ExecReload=/bin/kill -HUP $MAINPID
-KillMode=mixed
-KillSignal=SIGINT
-Restart=on-failure
-TimeoutSec=10
-LimitNOFILE=30000000
-
-[Install]
-WantedBy=multi-user.target
@@ -5,10 +5,10 @@ executors:
     resource_class: xlarge
     docker:
      # NB: when changed, do not forget to update rust image tag in all Dockerfiles
-     - image: zimg/rust:1.58
+     - image: neondatabase/rust:1.58
   neon-executor:
     docker:
-     - image: zimg/rust:1.58
+     - image: neondatabase/rust:1.58

 jobs:
   # A job to build postgres
@@ -37,7 +37,7 @@ jobs:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
-           - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
+           - v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}

      # Build postgres if the restore_cache didn't find a build.
      # `make` can't figure out whether the cache is valid, since
@@ -54,7 +54,7 @@ jobs:

      - save_cache:
          name: Save postgres cache
-         key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
+         key: v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
          paths:
            - tmp_install

@@ -85,7 +85,7 @@ jobs:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
-           - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
+           - v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}

      - restore_cache:
          name: Restore rust cache
@@ -93,31 +93,29 @@ jobs:
            # Require an exact match. While an out of date cache might speed up the build,
            # there's no way to clean out old packages, so the cache grows every time something
            # changes.
-           - v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
+           - v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}

      # Build the rust code, including test binaries
      - run:
          name: Rust build << parameters.build_type >>
          command: |
            if [[ $BUILD_TYPE == "debug" ]]; then
-             cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
              CARGO_FLAGS=
            elif [[ $BUILD_TYPE == "release" ]]; then
-             cov_prefix=()
              CARGO_FLAGS="--release --features profiling"
            fi

            export CARGO_INCREMENTAL=0
            export CACHEPOT_BUCKET=zenith-rust-cachepot
-           export RUSTC_WRAPPER=cachepot
+           export RUSTC_WRAPPER=""
            export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
            export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
-           "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
+           mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
            cachepot -s

      - save_cache:
          name: Save rust cache
-         key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
+         key: v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
          paths:
            - ~/.cargo/registry
            - ~/.cargo/git
@@ -128,35 +126,22 @@ jobs:
          name: cargo test
          command: |
            if [[ $BUILD_TYPE == "debug" ]]; then
-             cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
              CARGO_FLAGS=
            elif [[ $BUILD_TYPE == "release" ]]; then
-             cov_prefix=()
              CARGO_FLAGS=--release
            fi

-           "${cov_prefix[@]}" cargo test $CARGO_FLAGS
+           cargo test $CARGO_FLAGS

      # Install the rust binaries, for use by test jobs
      - run:
          name: Install rust binaries
          command: |
-           if [[ $BUILD_TYPE == "debug" ]]; then
-             cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
-           elif [[ $BUILD_TYPE == "release" ]]; then
-             cov_prefix=()
-           fi
-
            binaries=$(
-             "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
+             cargo metadata --format-version=1 --no-deps |
              jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
            )

-           test_exe_paths=$(
-             "${cov_prefix[@]}" cargo test --message-format=json --no-run |
-             jq -r '.executable | select(. != null)'
-           )
-
            mkdir -p /tmp/zenith/bin
            mkdir -p /tmp/zenith/test_bin
            mkdir -p /tmp/zenith/etc
@@ -166,34 +151,15 @@ jobs:
              SRC=target/$BUILD_TYPE/$bin
              DST=/tmp/zenith/bin/$bin
              cp $SRC $DST
-             echo $DST >> /tmp/zenith/etc/binaries.list
            done

-           # Install test executables (for code coverage)
-           if [[ $BUILD_TYPE == "debug" ]]; then
-             for bin in $test_exe_paths; do
-               SRC=$bin
-               DST=/tmp/zenith/test_bin/$(basename $bin)
-               cp $SRC $DST
-               echo $DST >> /tmp/zenith/etc/binaries.list
-             done
-           fi
-
      # Install the postgres binaries, for use by test jobs
      - run:
          name: Install postgres binaries
          command: |
            cp -a tmp_install /tmp/zenith/pg_install

-     - run:
-         name: Merge coverage data
-         command: |
-           # This will speed up workspace uploads
-           if [[ $BUILD_TYPE == "debug" ]]; then
-             scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
-           fi
-
-     # Save the rust binaries and coverage data for other jobs in this workflow.
+     # Save rust binaries for other jobs in the workflow
      - persist_to_workspace:
          root: /tmp/zenith
          paths:
@@ -286,7 +252,7 @@ jobs:
          # no_output_timeout, specified here.
          no_output_timeout: 10m
          environment:
-           - ZENITH_BIN: /tmp/zenith/bin
+           - NEON_BIN: /tmp/zenith/bin
            - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
            - TEST_OUTPUT: /tmp/test_output
            # this variable will be embedded in perf test report
@@ -314,12 +280,6 @@ jobs:

            export GITHUB_SHA=$CIRCLE_SHA1

-           if [[ $BUILD_TYPE == "debug" ]]; then
-             cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
-           elif [[ $BUILD_TYPE == "release" ]]; then
-             cov_prefix=()
-           fi
-
            # Run the tests.
            #
            # The junit.xml file allows CircleCI to display more fine-grained test information
@@ -330,7 +290,7 @@ jobs:
            # -n4 uses four processes to run tests via pytest-xdist
            # -s is not used to prevent pytest from capturing output, because tests are running
            # in parallel and logs are mixed between different tests
-           "${cov_prefix[@]}" ./scripts/pytest \
+           ./scripts/pytest \
              --junitxml=$TEST_OUTPUT/junit.xml \
              --tb=short \
              --verbose \
@@ -359,379 +319,12 @@ jobs:
      # The store_test_results step tells CircleCI where to find the junit.xml file.
      - store_test_results:
          path: /tmp/test_output
-     - run:
-         name: Merge coverage data
-         command: |
-           # This will speed up workspace uploads
-           if [[ $BUILD_TYPE == "debug" ]]; then
-             scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
-           fi
-     # Save coverage data (if any)
+     # Save data (if any)
      - persist_to_workspace:
          root: /tmp/zenith
          paths:
            - "*"

-  coverage-report:
-    executor: neon-xlarge-executor
-    steps:
-      - attach_workspace:
-          at: /tmp/zenith
-      - checkout
-      - restore_cache:
-          name: Restore rust cache
-          keys:
-            # Require an exact match. While an out of date cache might speed up the build,
-            # there's no way to clean out old packages, so the cache grows every time something
-            # changes.
-            - v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
-      - run:
-          name: Build coverage report
-          command: |
-            COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
-
-            scripts/coverage \
-              --dir=/tmp/zenith/coverage report \
-              --input-objects=/tmp/zenith/etc/binaries.list \
-              --commit-url=$COMMIT_URL \
-              --format=github
-      - run:
-          name: Upload coverage report
-          command: |
-            LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
-            REPORT_URL=https://neondatabase.github.io/zenith-coverage-data/$CIRCLE_SHA1
-            COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
-
-            scripts/git-upload \
-              --repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/neondatabase/zenith-coverage-data.git \
-              --message="Add code coverage for $COMMIT_URL" \
-              copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
-
-            # Add link to the coverage report to the commit
-            curl -f -X POST \
-              https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
-              -H "Accept: application/vnd.github.v3+json" \
-              --user "$CI_ACCESS_TOKEN" \
-              --data \
-              "{
-                \"state\": \"success\",
-                \"context\": \"zenith-coverage\",
-                \"description\": \"Coverage report is ready\",
-                \"target_url\": \"$REPORT_URL\"
-              }"
-
-  # Build neondatabase/neon:latest image and push it to Docker hub
-  docker-image:
-    docker:
-      - image: cimg/base:2021.04
-    steps:
-      - checkout
-      - setup_remote_docker:
-          docker_layer_caching: true
-      - run:
-          name: Init postgres submodule
-          command: git submodule update --init --depth 1
-      - run:
-          name: Build and push Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            DOCKER_TAG=$(git log --oneline|wc -l)
-            docker build \
-              --pull \
-              --build-arg GIT_VERSION=${CIRCLE_SHA1} \
-              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
-              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
-              --tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:latest .
-            docker push neondatabase/neon:${DOCKER_TAG}
-            docker push neondatabase/neon:latest
-
-  # Build neondatabase/compute-node:latest image and push it to Docker hub
-  docker-image-compute:
-    docker:
-      - image: cimg/base:2021.04
-    steps:
-      - checkout
-      - setup_remote_docker:
-          docker_layer_caching: true
-      - run:
-          name: Build and push compute-tools Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            docker build \
-              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
-              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
-              --tag neondatabase/compute-tools:local \
-              --tag neondatabase/compute-tools:latest \
-              -f Dockerfile.compute-tools .
-            # Only push :latest image
-            docker push neondatabase/compute-tools:latest
-      - run:
-          name: Init postgres submodule
-          command: git submodule update --init --depth 1
-      - run:
-          name: Build and push compute-node Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            DOCKER_TAG=$(git log --oneline|wc -l)
-            docker build --tag neondatabase/compute-node:${DOCKER_TAG} \
-              --tag neondatabase/compute-node:latest vendor/postgres \
-              --build-arg COMPUTE_TOOLS_TAG=local
-            docker push neondatabase/compute-node:${DOCKER_TAG}
-            docker push neondatabase/compute-node:latest
-
-  # Build production neondatabase/neon:release image and push it to Docker hub
-  docker-image-release:
-    docker:
-      - image: cimg/base:2021.04
-    steps:
-      - checkout
-      - setup_remote_docker:
-          docker_layer_caching: true
-      - run:
-          name: Init postgres submodule
-          command: git submodule update --init --depth 1
-      - run:
-          name: Build and push Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            DOCKER_TAG="release-$(git log --oneline|wc -l)"
-            docker build \
-              --pull \
-              --build-arg GIT_VERSION=${CIRCLE_SHA1} \
-              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
-              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
-              --tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:release .
-            docker push neondatabase/neon:${DOCKER_TAG}
-            docker push neondatabase/neon:release
-
-  # Build production neondatabase/compute-node:release image and push it to Docker hub
-  docker-image-compute-release:
-    docker:
-      - image: cimg/base:2021.04
-    steps:
-      - checkout
-      - setup_remote_docker:
-          docker_layer_caching: true
-      - run:
-          name: Build and push compute-tools Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            docker build \
-              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
-              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
-              --tag neondatabase/compute-tools:release \
-              --tag neondatabase/compute-tools:local \
-              -f Dockerfile.compute-tools .
-            # Only push :release image
-            docker push neondatabase/compute-tools:release
-      - run:
-          name: Init postgres submodule
-          command: git submodule update --init --depth 1
-      - run:
-          name: Build and push compute-node Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            DOCKER_TAG="release-$(git log --oneline|wc -l)"
-            docker build --tag neondatabase/compute-node:${DOCKER_TAG} \
-              --tag neondatabase/compute-node:release vendor/postgres \
-              --build-arg COMPUTE_TOOLS_TAG=local
-            docker push neondatabase/compute-node:${DOCKER_TAG}
-            docker push neondatabase/compute-node:release
-
-  deploy-staging:
-    docker:
-      - image: cimg/python:3.10
-    steps:
-      - checkout
-      - setup_remote_docker
-      - run:
-          name: Setup ansible
-          command: |
-            pip install --progress-bar off --user ansible boto3
-      - run:
-          name: Redeploy
-          command: |
-            cd "$(pwd)/.circleci/ansible"
-
-            ./get_binaries.sh
-
-            echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key
-            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
-            chmod 0600 ssh-key
-            ssh-add ssh-key
-            rm -f ssh-key ssh-key-cert.pub
-
-            ansible-playbook deploy.yaml -i staging.hosts
-            rm -f neon_install.tar.gz .neon_current_version
-
-  deploy-staging-proxy:
-    docker:
-      - image: cimg/base:2021.04
-    environment:
-      KUBECONFIG: .kubeconfig
-    steps:
-      - checkout
-      - run:
-          name: Store kubeconfig file
-          command: |
-            echo "${STAGING_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
-            chmod 0600 ${KUBECONFIG}
-      - run:
-          name: Setup helm v3
-          command: |
-            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
-            helm repo add neondatabase https://neondatabase.github.io/helm-charts
-      - run:
-          name: Re-deploy proxy
-          command: |
-            DOCKER_TAG=$(git log --oneline|wc -l)
-            helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
-            helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
-
-  deploy-neon-stress:
-    docker:
-      - image: cimg/python:3.10
-    steps:
-      - checkout
-      - setup_remote_docker
-      - run:
-          name: Setup ansible
-          command: |
-            pip install --progress-bar off --user ansible boto3
-      - run:
-          name: Redeploy
-          command: |
-            cd "$(pwd)/.circleci/ansible"
-
-            ./get_binaries.sh
-
-            echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key
-            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
-            chmod 0600 ssh-key
-            ssh-add ssh-key
-            rm -f ssh-key ssh-key-cert.pub
-
-            ansible-playbook deploy.yaml -i neon-stress.hosts
-            rm -f neon_install.tar.gz .neon_current_version
-
-  deploy-neon-stress-proxy:
-    docker:
-      - image: cimg/base:2021.04
-    environment:
-      KUBECONFIG: .kubeconfig
-    steps:
-      - checkout
-      - run:
-          name: Store kubeconfig file
-          command: |
-            echo "${NEON_STRESS_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
-            chmod 0600 ${KUBECONFIG}
-      - run:
-          name: Setup helm v3
-          command: |
-            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
-            helm repo add neondatabase https://neondatabase.github.io/helm-charts
-      - run:
-          name: Re-deploy proxy
-          command: |
-            DOCKER_TAG=$(git log --oneline|wc -l)
-            helm upgrade neon-stress-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
-            helm upgrade neon-stress-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
-
-  deploy-release:
-    docker:
-      - image: cimg/python:3.10
-    steps:
-      - checkout
-      - setup_remote_docker
-      - run:
-          name: Setup ansible
-          command: |
-            pip install --progress-bar off --user ansible boto3
-      - run:
-          name: Redeploy
-          command: |
-            cd "$(pwd)/.circleci/ansible"
-
-            RELEASE=true ./get_binaries.sh
-
-            echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key
-            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
-            chmod 0600 ssh-key
-            ssh-add ssh-key
-            rm -f ssh-key ssh-key-cert.pub
-
-            ansible-playbook deploy.yaml -i production.hosts
-            rm -f neon_install.tar.gz .neon_current_version
-
-  deploy-release-proxy:
-    docker:
-      - image: cimg/base:2021.04
-    environment:
-      KUBECONFIG: .kubeconfig
-    steps:
-      - checkout
-      - run:
-          name: Store kubeconfig file
-          command: |
-            echo "${PRODUCTION_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
-            chmod 0600 ${KUBECONFIG}
-      - run:
-          name: Setup helm v3
-          command: |
-            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
-            helm repo add neondatabase https://neondatabase.github.io/helm-charts
-      - run:
-          name: Re-deploy proxy
-          command: |
-            DOCKER_TAG="release-$(git log --oneline|wc -l)"
-            helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
-            helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
-
-  # Trigger a new remote CI job
-  remote-ci-trigger:
-    docker:
-      - image: cimg/base:2021.04
-    parameters:
-      remote_repo:
-        type: string
-    environment:
-      REMOTE_REPO: << parameters.remote_repo >>
-    steps:
-      - run:
-          name: Set PR's status to pending
-          command: |
-            LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
-
-            curl -f -X POST \
-              https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
-              -H "Accept: application/vnd.github.v3+json" \
-              --user "$CI_ACCESS_TOKEN" \
-              --data \
-              "{
-                \"state\": \"pending\",
-                \"context\": \"neon-cloud-e2e\",
-                \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
-              }"
-      - run:
-          name: Request a remote CI test
-          command: |
-            LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
-
-            curl -f -X POST \
-              https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-              -H "Accept: application/vnd.github.v3+json" \
-              --user "$CI_ACCESS_TOKEN" \
-              --data \
-              "{
-                \"ref\": \"main\",
-                \"inputs\": {
-                  \"ci_job_name\": \"neon-cloud-e2e\",
-                  \"commit_hash\": \"$CIRCLE_SHA1\",
-                  \"remote_repo\": \"$LOCAL_REPO\"
-                }
-              }"
-
 workflows:
   build_and_test:
     jobs:
@@ -774,120 +367,3 @@ workflows:
          save_perf_report: true
          requires:
            - build-neon-release
-      - coverage-report:
-          # Context passes credentials for gh api
-          context: CI_ACCESS_TOKEN
-          requires:
-            # TODO: consider adding more
-            - other-tests-debug
-      - docker-image:
-          # Context gives an ability to login
-          context: Docker Hub
-          # Build image only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - pg_regress-tests-release
-            - other-tests-release
-      - docker-image-compute:
-          # Context gives an ability to login
-          context: Docker Hub
-          # Build image only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - pg_regress-tests-release
-            - other-tests-release
-      - deploy-staging:
-          # Context gives an ability to login
-          context: Docker Hub
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - docker-image
-      - deploy-staging-proxy:
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - docker-image
-
-      - deploy-neon-stress:
-          # Context gives an ability to login
-          context: Docker Hub
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - docker-image
-      - deploy-neon-stress-proxy:
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - docker-image
-
-      - docker-image-release:
-          # Context gives an ability to login
-          context: Docker Hub
-          # Build image only for commits to main
-          filters:
-            branches:
-              only:
-                - release
-          requires:
-            - pg_regress-tests-release
-            - other-tests-release
-      - docker-image-compute-release:
-          # Context gives an ability to login
-          context: Docker Hub
-          # Build image only for commits to main
-          filters:
-            branches:
-              only:
-                - release
-          requires:
-            - pg_regress-tests-release
-            - other-tests-release
-      - deploy-release:
-          # Context gives an ability to login
-          context: Docker Hub
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - release
-          requires:
-            - docker-image-release
-      - deploy-release-proxy:
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - release
-          requires:
-            - docker-image-release
-      - remote-ci-trigger:
-          # Context passes credentials for gh api
-          context: CI_ACCESS_TOKEN
-          remote_repo: "neondatabase/cloud"
-          requires:
-            # XXX: Successful build doesn't mean everything is OK, but
-            # the job to be triggered takes so much time to complete (~22 min)
-            # that it's better not to wait for the commented-out steps
-            - build-neon-release
-            # - pg_regress-tests-release
-            # - other-tests-release
@@ -9,8 +9,8 @@ tmp_install
 tmp_check_cli
 test_output
 .vscode
-.zenith
-integration_tests/.zenith
+.neon
+integration_tests/.neon
 .mypy_cache

 Dockerfile
.github/actions/run-python-test-set/action.yml (vendored, new file, 140 lines)
@@ -0,0 +1,140 @@
+name: 'Run python test'
+description: 'Runs a Neon python test set, performing all the required preparations before'
+
+inputs:
+  build_type:
+    description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".'
+    required: true
+  rust_toolchain:
+    description: 'Rust toolchain version to fetch the caches'
+    required: true
+  test_selection:
+    description: 'A python test suite to run'
+    required: true
+  extra_params:
+    description: 'Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr'
+    required: false
+    default: ''
+  needs_postgres_source:
+    description: 'Set to true if the test suite requires postgres source checked out'
+    required: false
+    default: 'false'
+  run_in_parallel:
+    description: 'Whether to run tests in parallel'
+    required: false
+    default: 'true'
+  save_perf_report:
+    description: 'Whether to upload the performance report'
+    required: false
+    default: 'false'
+
+runs:
+  using: "composite"
+  steps:
+    - name: Get Neon artifact for restoration
+      uses: actions/download-artifact@v3
+      with:
+        name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact
+        path: ./neon-artifact/
+
+    - name: Extract Neon artifact
+      shell: bash -ex {0}
+      run: |
+        mkdir -p /tmp/neon/
+        tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/
+        rm -rf ./neon-artifact/
+
+    - name: Checkout
+      if: inputs.needs_postgres_source == 'true'
+      uses: actions/checkout@v3
+      with:
+        submodules: true
+        fetch-depth: 1
+
+    - name: Cache poetry deps
+      id: cache_poetry
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/pypoetry/virtualenvs
+        key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+
+    - name: Install Python deps
+      shell: bash -ex {0}
+      run: ./scripts/pysync
+
+    - name: Run pytest
+      env:
+        NEON_BIN: /tmp/neon/bin
+        POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+        TEST_OUTPUT: /tmp/test_output
+        # this variable will be embedded in perf test report
+        # and is needed to distinguish different environments
+        PLATFORM: github-actions-selfhosted
+      shell: bash -ex {0}
+      run: |
+        PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
+        rm -rf $PERF_REPORT_DIR
+
+        TEST_SELECTION="test_runner/${{ inputs.test_selection }}"
+        EXTRA_PARAMS="${{ inputs.extra_params }}"
+        if [ -z "$TEST_SELECTION" ]; then
+          echo "test_selection must be set"
+          exit 1
+        fi
+        if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
+          EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
+        fi
+        if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
+          if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
+            mkdir -p "$PERF_REPORT_DIR"
+            EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
+          fi
+        fi
+
+        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
+          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
+          cov_prefix=()
+        fi
+
+        # Run the tests.
+        #
+        # The junit.xml file allows CircleCI to display more fine-grained test information
+        # in its "Tests" tab in the results page.
+        # --verbose prints name of each test (helpful when there are
+        # multiple tests in one file)
+        # -rA prints summary in the end
+        # -n4 uses four processes to run tests via pytest-xdist
+        # -s is not used to prevent pytest from capturing output, because tests are running
+        # in parallel and logs are mixed between different tests
+        "${cov_prefix[@]}" ./scripts/pytest \
+          --junitxml=$TEST_OUTPUT/junit.xml \
+          --tb=short \
+          --verbose \
+          -m "not remote_cluster" \
+          -rA $TEST_SELECTION $EXTRA_PARAMS
+
+        if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
+          if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
+            export REPORT_FROM="$PERF_REPORT_DIR"
+            export REPORT_TO=local
+            scripts/generate_and_push_perf_report.sh
+          fi
+        fi
+
+    - name: Delete all data but logs
+      shell: bash -ex {0}
+      if: always()
+      run: |
+        du -sh /tmp/test_output/*
+        find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
+        du -sh /tmp/test_output/*
+
+    - name: Upload python test logs
+      if: always()
+      uses: actions/upload-artifact@v3
+      with:
+        retention-days: 7
+        if-no-files-found: error
+        name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs
+        path: /tmp/test_output/
.github/actions/save-coverage-data/action.yml (vendored, new file, 17 lines)
@@ -0,0 +1,17 @@
+name: 'Merge and upload coverage data'
+description: 'Compresses and uploads the coverage data as an artifact'
+
+runs:
+  using: "composite"
+  steps:
+    - name: Merge coverage data
+      shell: bash -ex {0}
+      run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
+
+    - name: Upload coverage data
+      uses: actions/upload-artifact@v3
+      with:
+        retention-days: 7
+        if-no-files-found: error
+        name: coverage-data-artifact
+        path: /tmp/coverage/
@@ -6,5 +6,7 @@ timeout = 30

 [ssh_connection]
 ssh_args = -F ./ansible.ssh.cfg
-scp_if_ssh = True
+# teleport doesn't support sftp yet https://github.com/gravitational/teleport/issues/7127
+# and scp neither worked for me
+transfer_method = piped
 pipelining = True
@@ -1,3 +1,7 @@
+# Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed
+# (use pre 8.5 option name to cope with old ssh in CI)
+PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com
+
 Host tele.zenith.tech
   User admin
   Port 3023
@@ -57,7 +57,7 @@
      args:
        creates: "/storage/pageserver/data/tenants"
      environment:
-       ZENITH_REPO_DIR: "/storage/pageserver/data"
+       NEON_REPO_DIR: "/storage/pageserver/data"
        LD_LIBRARY_PATH: "/usr/local/lib"
      become: true
      tags:
@@ -131,7 +131,7 @@
      args:
        creates: "/storage/safekeeper/data/safekeeper.id"
      environment:
-       ZENITH_REPO_DIR: "/storage/safekeeper/data"
+       NEON_REPO_DIR: "/storage/safekeeper/data"
        LD_LIBRARY_PATH: "/usr/local/lib"
      become: true
      tags:
@@ -12,6 +12,7 @@ pageservers
 safekeepers

 [storage:vars]
+env_name = neon-stress
 console_mgmt_base_url = http://neon-stress-console.local
 bucket_name = neon-storage-ireland
 bucket_region = eu-west-1
@@ -1,6 +1,7 @@
 [pageservers]
 #zenith-1-ps-1 console_region_id=1
 zenith-1-ps-2 console_region_id=1
+zenith-1-ps-3 console_region_id=1

 [safekeepers]
 zenith-1-sk-1 console_region_id=1
@@ -12,6 +13,7 @@ pageservers
 safekeepers

 [storage:vars]
+env_name = prod-1
 console_mgmt_base_url = http://console-release.local
 bucket_name = zenith-storage-oregon
 bucket_region = us-west-2
@@ -1,6 +1,7 @@
 [pageservers]
 #zenith-us-stage-ps-1 console_region_id=27
 zenith-us-stage-ps-2 console_region_id=27
+zenith-us-stage-ps-3 console_region_id=27

 [safekeepers]
 zenith-us-stage-sk-4 console_region_id=27
@@ -12,6 +13,7 @@ pageservers
 safekeepers

 [storage:vars]
+env_name = us-stage
 console_mgmt_base_url = http://console-staging.local
 bucket_name = zenith-staging-storage-us-east-1
 bucket_region = us-east-1
@@ -5,7 +5,7 @@ After=network.target auditd.service
 [Service]
 Type=simple
 User=pageserver
-Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
+Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
 ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
 ExecReload=/bin/kill -HUP $MAINPID
 KillMode=mixed
.github/ansible/systemd/safekeeper.service (vendored, new file, 18 lines)
@@ -0,0 +1,18 @@
+[Unit]
+Description=Zenith safekeeper
+After=network.target auditd.service
+
+[Service]
+Type=simple
+User=safekeeper
+Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
+ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
+ExecReload=/bin/kill -HUP $MAINPID
+KillMode=mixed
+KillSignal=SIGINT
+Restart=on-failure
+TimeoutSec=10
+LimitNOFILE=30000000
+
+[Install]
+WantedBy=multi-user.target
.github/workflows/benchmarking.yml (vendored, 8 lines changed)
@@ -26,11 +26,11 @@ jobs:
     runs-on: [self-hosted, zenith-benchmarker]

     env:
-      POSTGRES_DISTRIB_DIR: "/usr/pgsql-13"
+      POSTGRES_DISTRIB_DIR: "/usr/pgsql-14"

     steps:
       - name: Checkout zenith repo
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3

       # actions/setup-python@v2 is not working correctly on self-hosted runners
       # see https://github.com/actions/setup-python/issues/162
@@ -88,7 +88,7 @@ jobs:
          # Plus time needed to initialize the test databases.
          TEST_PG_BENCH_DURATIONS_MATRIX: "300"
          TEST_PG_BENCH_SCALES_MATRIX: "10,100"
-         PLATFORM: "zenith-staging"
+         PLATFORM: "neon-staging"
          BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
          REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
        run: |
@@ -96,7 +96,7 @@ jobs:
          # since it might generate duplicates when calling ingest_perf_test_result.py
          rm -rf perf-report-staging
          mkdir -p perf-report-staging
-         ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging
+         ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600

      - name: Submit result
        env:
.github/workflows/build_and_test.yml (vendored, new file, 642 lines)
@@ -0,0 +1,642 @@
+name: Test
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+defaults:
+  run:
+    shell: bash -ex {0}
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+  COPT: '-Werror'
+
+jobs:
+  build-postgres:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+        rust_toolchain: [ 1.58 ]
+
+    env:
+      BUILD_TYPE: ${{ matrix.build_type }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg revision for caching
+        id: pg_ver
+        run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
+
+      - name: Cache postgres build
+        id: cache_pg
+        uses: actions/cache@v3
+        with:
+          path: tmp_install/
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Build postgres
+        if: steps.cache_pg.outputs.cache-hit != 'true'
+        run: mold -run make postgres -j$(nproc)
+
+      # actions/cache@v3 does not allow concurrently using the same cache across job steps, so use a separate cache
+      - name: Prepare postgres artifact
+        run: tar -C tmp_install/ -czf ./pg.tgz .
+      - name: Upload postgres artifact
+        uses: actions/upload-artifact@v3
+        with:
+          retention-days: 7
+          if-no-files-found: error
+          name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact
+          path: ./pg.tgz
+
+
+  build-neon:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ build-postgres ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+        rust_toolchain: [ 1.58 ]
+
+    env:
+      BUILD_TYPE: ${{ matrix.build_type }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Get postgres artifact for restoration
+        uses: actions/download-artifact@v3
+        with:
+          name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact
+          path: ./postgres-artifact/
+      - name: Extract postgres artifact
+        run: |
+          mkdir ./tmp_install/
+          tar -xf ./postgres-artifact/pg.tgz -C ./tmp_install/
+          rm -rf ./postgres-artifact/
+
+      - name: Cache cargo deps
+        id: cache_cargo
+        uses: actions/cache@v3
+        with:
+          path: |
+            ~/.cargo/registry/
+            ~/.cargo/git/
+            target/
+          # Fall back to older versions of the key, if no cache for current Cargo.lock was found
+          key: |
+            v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
+            v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
+
+      - name: Run cargo build
+        run: |
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+            CARGO_FLAGS=
+          elif [[ $BUILD_TYPE == "release" ]]; then
+            cov_prefix=()
+            CARGO_FLAGS="--release --features profiling"
+          fi
+
+          "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
+
+      - name: Run cargo test
+        run: |
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+            CARGO_FLAGS=
+          elif [[ $BUILD_TYPE == "release" ]]; then
+            cov_prefix=()
+            CARGO_FLAGS=--release
+          fi
+
+          "${cov_prefix[@]}" cargo test $CARGO_FLAGS
+
+      - name: Install rust binaries
+        run: |
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+          elif [[ $BUILD_TYPE == "release" ]]; then
+            cov_prefix=()
+          fi
+
+          binaries=$(
+            "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
+            jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
+          )
+
+          test_exe_paths=$(
+            "${cov_prefix[@]}" cargo test --message-format=json --no-run |
+            jq -r '.executable | select(. != null)'
+          )
+
+          mkdir -p /tmp/neon/bin/
+          mkdir -p /tmp/neon/test_bin/
+          mkdir -p /tmp/neon/etc/
+
+          # Keep bloated coverage data files away from the rest of the artifact
+          mkdir -p /tmp/coverage/
+
+          # Install target binaries
+          for bin in $binaries; do
+            SRC=target/$BUILD_TYPE/$bin
+            DST=/tmp/neon/bin/$bin
+            cp "$SRC" "$DST"
+          done
+
+          # Install test executables and write list of all binaries (for code coverage)
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            for bin in $binaries; do
+              echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
+            done
+            for bin in $test_exe_paths; do
+              SRC=$bin
+              DST=/tmp/neon/test_bin/$(basename $bin)
+              cp "$SRC" "$DST"
+              echo "$DST" >> /tmp/coverage/binaries.list
+            done
+          fi
+
+      - name: Install postgres binaries
+        run: cp -a tmp_install /tmp/neon/pg_install
+
+      - name: Prepare neon artifact
+        run: tar -C /tmp/neon/ -czf ./neon.tgz .
+
+      - name: Upload neon binaries
+        uses: actions/upload-artifact@v3
+        with:
+          retention-days: 7
+          if-no-files-found: error
+          name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
+          path: ./neon.tgz
+
+      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
+      - name: Merge and upload coverage data
+        if: matrix.build_type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
+
+  pg_regress-tests:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ build-neon ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+        rust_toolchain: [ 1.58 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 2
+
+      - name: Pytest regress tests
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ matrix.build_type }}
+          rust_toolchain: ${{ matrix.rust_toolchain }}
+          test_selection: batch_pg_regress
+          needs_postgres_source: true
+
+      - name: Merge and upload coverage data
+        if: matrix.build_type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
+  other-tests:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ build-neon ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+        rust_toolchain: [ 1.58 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 2
+
+      - name: Pytest other tests
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ matrix.build_type }}
+          rust_toolchain: ${{ matrix.rust_toolchain }}
+          test_selection: batch_others
+
+      - name: Merge and upload coverage data
+        if: matrix.build_type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
+  benchmarks:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ build-neon ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ release ]
+        rust_toolchain: [ 1.58 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 2
+
+      - name: Pytest benchmarks
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ matrix.build_type }}
+          rust_toolchain: ${{ matrix.rust_toolchain }}
+          test_selection: performance
+          run_in_parallel: false
+          save_perf_report: true
+        env:
+          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+      # XXX: no coverage data handling here, since benchmarks are run on release builds,
+      # while coverage is currently collected for the debug ones
+
+  coverage-report:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ other-tests, pg_regress-tests ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug ]
+        rust_toolchain: [ 1.58 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Restore cargo deps cache
+        id: cache_cargo
+        uses: actions/cache@v3
+        with:
+          path: |
+            ~/.cargo/registry/
+            ~/.cargo/git/
+            target/
+          key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
+
+      - name: Get Neon artifact for restoration
+        uses: actions/download-artifact@v3
+        with:
+          name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
+          path: ./neon-artifact/
+
+      - name: Extract Neon artifact
+        run: |
+          mkdir -p /tmp/neon/
+          tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/
+          rm -rf ./neon-artifact/
+
+      - name: Restore coverage data
+        uses: actions/download-artifact@v3
+        with:
+          name: coverage-data-artifact
+          path: /tmp/coverage/
+
+      - name: Merge coverage data
+        run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
+
+      - name: Build and upload coverage report
+        run: |
+          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+          COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA
+
+          scripts/coverage \
+            --dir=/tmp/coverage report \
+            --input-objects=/tmp/coverage/binaries.list \
+            --commit-url=$COMMIT_URL \
+            --format=github
+
+          REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA
+
+          scripts/git-upload \
+            --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \
+            --message="Add code coverage for $COMMIT_URL" \
+            copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE
+
+          # Add link to the coverage report to the commit
+          curl -f -X POST \
+            https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+            -H "Accept: application/vnd.github.v3+json" \
+            --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+            --data \
+            "{
+              \"state\": \"success\",
+              \"context\": \"neon-coverage\",
+              \"description\": \"Coverage report is ready\",
+              \"target_url\": \"$REPORT_URL\"
+            }"
+
+  trigger-e2e-tests:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ build-neon ]
+    steps:
+      - name: Set PR's status to pending and request a remote CI test
|
||||||
|
run: |
|
||||||
|
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
|
||||||
|
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
|
||||||
|
|
||||||
|
REMOTE_REPO="${{ github.repository_owner }}/cloud"
|
||||||
|
|
||||||
|
curl -f -X POST \
|
||||||
|
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
|
||||||
|
-H "Accept: application/vnd.github.v3+json" \
|
||||||
|
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||||
|
--data \
|
||||||
|
"{
|
||||||
|
\"state\": \"pending\",
|
||||||
|
\"context\": \"neon-cloud-e2e\",
|
||||||
|
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
||||||
|
}"
|
||||||
|
|
||||||
|
curl -f -X POST \
|
||||||
|
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
|
||||||
|
-H "Accept: application/vnd.github.v3+json" \
|
||||||
|
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||||
|
--data \
|
||||||
|
"{
|
||||||
|
\"ref\": \"main\",
|
||||||
|
\"inputs\": {
|
||||||
|
\"ci_job_name\": \"neon-cloud-e2e\",
|
||||||
|
\"commit_hash\": \"$COMMIT_SHA\",
|
||||||
|
\"remote_repo\": \"${{ github.repository }}\"
|
||||||
|
}
|
||||||
|
}"
|
||||||
|
|
||||||
|
docker-image:
|
||||||
|
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||||
|
needs: [ pg_regress-tests, other-tests ]
|
||||||
|
if: |
|
||||||
|
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||||
|
github.event_name != 'workflow_dispatch'
|
||||||
|
outputs:
|
||||||
|
build-tag: ${{steps.build-tag.outputs.tag}}
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: Login to DockerHub
|
||||||
|
uses: docker/login-action@v1
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||||
|
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v1
|
||||||
|
with:
|
||||||
|
driver: docker
|
||||||
|
|
||||||
|
- name: Get build tag
|
||||||
|
run: |
|
||||||
|
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||||
|
echo "::set-output name=tag::$(git rev-list --count HEAD)"
|
||||||
|
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||||
|
echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
|
||||||
|
else
|
||||||
|
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
id: build-tag
|
||||||
|
|
||||||
|
- name: Get legacy build tag
  run: |
    if [[ "$GITHUB_REF_NAME" == "main" ]]; then
      echo "::set-output name=tag::latest"
    elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
      echo "::set-output name=tag::release"
    else
      echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
      exit 1
    fi
  id: legacy-build-tag
|
||||||
|
|
||||||
|
- name: Build neon Docker image
|
||||||
|
uses: docker/build-push-action@v2
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
build-args: |
|
||||||
|
GIT_VERSION="${{github.sha}}"
|
||||||
|
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
|
||||||
|
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
|
||||||
|
pull: true
|
||||||
|
push: true
|
||||||
|
tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}}
|
||||||
|
|
||||||
|
docker-image-compute:
|
||||||
|
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||||
|
needs: [ pg_regress-tests, other-tests ]
|
||||||
|
if: |
|
||||||
|
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||||
|
github.event_name != 'workflow_dispatch'
|
||||||
|
outputs:
|
||||||
|
build-tag: ${{steps.build-tag.outputs.tag}}
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: Login to DockerHub
|
||||||
|
uses: docker/login-action@v1
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||||
|
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v1
|
||||||
|
with:
|
||||||
|
driver: docker
|
||||||
|
|
||||||
|
- name: Get build tag
|
||||||
|
run: |
|
||||||
|
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||||
|
echo "::set-output name=tag::$(git rev-list --count HEAD)"
|
||||||
|
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||||
|
echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
|
||||||
|
else
|
||||||
|
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
id: build-tag
|
||||||
|
|
||||||
|
- name: Get legacy build tag
  run: |
    if [[ "$GITHUB_REF_NAME" == "main" ]]; then
      echo "::set-output name=tag::latest"
    elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
      echo "::set-output name=tag::release"
    else
      echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
      exit 1
    fi
  id: legacy-build-tag
|
||||||
|
|
||||||
|
- name: Build compute-tools Docker image
|
||||||
|
uses: docker/build-push-action@v2
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
build-args: |
|
||||||
|
GIT_VERSION="${{github.sha}}"
|
||||||
|
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
|
||||||
|
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
|
||||||
|
push: false
|
||||||
|
file: Dockerfile.compute-tools
|
||||||
|
tags: neondatabase/compute-tools:local
|
||||||
|
|
||||||
|
- name: Push compute-tools Docker image
|
||||||
|
uses: docker/build-push-action@v2
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
build-args: |
|
||||||
|
GIT_VERSION="${{github.sha}}"
|
||||||
|
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
|
||||||
|
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
|
||||||
|
push: true
|
||||||
|
file: Dockerfile.compute-tools
|
||||||
|
tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}}
|
||||||
|
|
||||||
|
- name: Build compute-node Docker image
|
||||||
|
uses: docker/build-push-action@v2
|
||||||
|
with:
|
||||||
|
context: ./vendor/postgres/
|
||||||
|
build-args:
|
||||||
|
COMPUTE_TOOLS_TAG=local
|
||||||
|
push: true
|
||||||
|
tags: neondatabase/compute-node:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/compute-node:${{steps.build-tag.outputs.tag}}
|
||||||
|
|
||||||
|
calculate-deploy-targets:
|
||||||
|
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||||
|
if: |
|
||||||
|
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||||
|
github.event_name != 'workflow_dispatch'
|
||||||
|
outputs:
|
||||||
|
matrix-include: ${{ steps.set-matrix.outputs.include }}
|
||||||
|
steps:
|
||||||
|
- id: set-matrix
|
||||||
|
run: |
|
||||||
|
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||||
|
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}'
|
||||||
|
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}'
|
||||||
|
echo "::set-output name=include::[$STAGING, $NEON_STRESS]"
|
||||||
|
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||||
|
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}'
|
||||||
|
echo "::set-output name=include::[$PRODUCTION]"
|
||||||
|
else
|
||||||
|
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
deploy:
|
||||||
|
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||||
|
# We need both storage **and** compute images for deploy, because control plane
|
||||||
|
# picks the compute version based on the storage version. If it notices a fresh
|
||||||
|
# storage it may bump the compute version. And if the compute image failed to build,
|
||||||
|
# it may break things badly.
|
||||||
|
needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
|
||||||
|
if: |
|
||||||
|
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||||
|
github.event_name != 'workflow_dispatch'
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: Setup ansible
|
||||||
|
run: |
|
||||||
|
pip install --progress-bar off --user ansible boto3
|
||||||
|
|
||||||
|
- name: Redeploy
|
||||||
|
run: |
|
||||||
|
cd "$(pwd)/.github/ansible"
|
||||||
|
|
||||||
|
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||||
|
./get_binaries.sh
|
||||||
|
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||||
|
RELEASE=true ./get_binaries.sh
|
||||||
|
else
|
||||||
|
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
eval $(ssh-agent)
|
||||||
|
echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key
|
||||||
|
echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
|
||||||
|
chmod 0600 ssh-key
|
||||||
|
ssh-add ssh-key
|
||||||
|
rm -f ssh-key ssh-key-cert.pub
|
||||||
|
|
||||||
|
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts
|
||||||
|
rm -f neon_install.tar.gz .neon_current_version
|
||||||
|
|
||||||
|
deploy-proxy:
|
||||||
|
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||||
|
# Compute image isn't strictly required for proxy deploy, but let's still wait for it
|
||||||
|
# to run all deploy jobs consistently.
|
||||||
|
needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
|
||||||
|
if: |
|
||||||
|
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||||
|
github.event_name != 'workflow_dispatch'
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
|
||||||
|
env:
|
||||||
|
KUBECONFIG: .kubeconfig
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: Store kubeconfig file
|
||||||
|
run: |
|
||||||
|
echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG}
|
||||||
|
chmod 0600 ${KUBECONFIG}
|
||||||
|
|
||||||
|
- name: Setup helm v3
|
||||||
|
run: |
|
||||||
|
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||||
|
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||||
|
|
||||||
|
- name: Re-deploy proxy
|
||||||
|
run: |
|
||||||
|
DOCKER_TAG=${{needs.docker-image.outputs.build-tag}}
|
||||||
|
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||||
|
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
name: Build and Test
|
name: Check code style and build
|
||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
@@ -6,15 +6,27 @@ on:
|
|||||||
- main
|
- main
|
||||||
pull_request:
|
pull_request:
|
||||||
|
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
shell: bash -ex {0}
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
env:
|
||||||
|
RUST_BACKTRACE: 1
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
regression-check:
|
check-codestyle-rust:
|
||||||
strategy:
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
# If we want to duplicate this job for different
|
# If we want to duplicate this job for different
|
||||||
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
|
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
|
||||||
rust_toolchain: [1.58]
|
rust_toolchain: [1.58]
|
||||||
os: [ubuntu-latest, macos-latest]
|
os: [ubuntu-latest, macos-latest]
|
||||||
timeout-minutes: 30
|
timeout-minutes: 50
|
||||||
name: run regression test suite
|
name: run regression test suite
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
|
|
||||||
@@ -92,5 +104,30 @@ jobs:
|
|||||||
- name: Run cargo clippy
|
- name: Run cargo clippy
|
||||||
run: ./run_clippy.sh
|
run: ./run_clippy.sh
|
||||||
|
|
||||||
- name: Run cargo test
|
- name: Ensure all project builds
|
||||||
run: cargo test --all --all-targets
|
run: cargo build --all --all-targets
|
||||||
|
|
||||||
|
check-codestyle-python:
|
||||||
|
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: false
|
||||||
|
fetch-depth: 1
|
||||||
|
|
||||||
|
- name: Cache poetry deps
|
||||||
|
id: cache_poetry
|
||||||
|
uses: actions/cache@v3
|
||||||
|
with:
|
||||||
|
path: ~/.cache/pypoetry/virtualenvs
|
||||||
|
key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}
|
||||||
|
|
||||||
|
- name: Install Python deps
|
||||||
|
run: ./scripts/pysync
|
||||||
|
|
||||||
|
- name: Run yapf to ensure code format
|
||||||
|
run: poetry run yapf --recursive --diff .
|
||||||
|
|
||||||
|
- name: Run mypy to check types
|
||||||
|
run: poetry run mypy .
|
||||||
.github/workflows/pg_clients.yml (new file)
@@ -0,0 +1,71 @@
|
|||||||
|
name: Test Postgres client libraries
|
||||||
|
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
# * is a special character in YAML so you have to quote this string
|
||||||
|
# ┌───────────── minute (0 - 59)
|
||||||
|
# │ ┌───────────── hour (0 - 23)
|
||||||
|
# │ │ ┌───────────── day of the month (1 - 31)
|
||||||
|
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||||
|
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||||
|
- cron: '23 02 * * *' # run once a day, timezone is utc
|
||||||
|
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test-postgres-client-libs:
|
||||||
|
runs-on: [ ubuntu-latest ]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: 3.9
|
||||||
|
|
||||||
|
- name: Install Poetry
|
||||||
|
uses: snok/install-poetry@v1
|
||||||
|
|
||||||
|
- name: Cache poetry deps
|
||||||
|
id: cache_poetry
|
||||||
|
uses: actions/cache@v3
|
||||||
|
with:
|
||||||
|
path: ~/.cache/pypoetry/virtualenvs
|
||||||
|
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||||
|
|
||||||
|
- name: Install Python deps
|
||||||
|
shell: bash -ex {0}
|
||||||
|
run: ./scripts/pysync
|
||||||
|
|
||||||
|
- name: Run pytest
|
||||||
|
env:
|
||||||
|
REMOTE_ENV: 1
|
||||||
|
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
|
||||||
|
TEST_OUTPUT: /tmp/test_output
|
||||||
|
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||||
|
shell: bash -ex {0}
|
||||||
|
run: |
|
||||||
|
# Test framework expects we have psql binary;
|
||||||
|
# but since we don't really need it in this test, let's mock it
|
||||||
|
mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql";
|
||||||
|
./scripts/pytest \
|
||||||
|
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||||
|
--tb=short \
|
||||||
|
--verbose \
|
||||||
|
-m "remote_cluster" \
|
||||||
|
-rA "test_runner/pg_clients"
|
||||||
|
|
||||||
|
- name: Post to a Slack channel
|
||||||
|
if: failure()
|
||||||
|
id: slack
|
||||||
|
uses: slackapi/slack-github-action@v1
|
||||||
|
with:
|
||||||
|
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||||
|
slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||||
|
env:
|
||||||
|
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||||
.gitignore
@@ -6,8 +6,8 @@ __pycache__/
|
|||||||
test_output/
|
test_output/
|
||||||
.vscode
|
.vscode
|
||||||
.idea
|
.idea
|
||||||
/.zenith
|
/.neon
|
||||||
/integration_tests/.zenith
|
/integration_tests/.neon
|
||||||
|
|
||||||
# Coverage
|
# Coverage
|
||||||
*.profraw
|
*.profraw
|
||||||
|
|||||||
@@ -6,5 +6,5 @@ target/
|
|||||||
tmp_install/
|
tmp_install/
|
||||||
__pycache__/
|
__pycache__/
|
||||||
test_output/
|
test_output/
|
||||||
.zenith/
|
.neon/
|
||||||
.git/
|
.git/
|
||||||
|
|||||||
Cargo.lock
@@ -64,6 +64,45 @@ dependencies = [
|
|||||||
"nodrop",
|
"nodrop",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "asn1-rs"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "30ff05a702273012438132f449575dbc804e27b2f3cbe3069aa237d26c98fa33"
|
||||||
|
dependencies = [
|
||||||
|
"asn1-rs-derive",
|
||||||
|
"asn1-rs-impl",
|
||||||
|
"displaydoc",
|
||||||
|
"nom",
|
||||||
|
"num-traits",
|
||||||
|
"rusticata-macros",
|
||||||
|
"thiserror",
|
||||||
|
"time 0.3.9",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "asn1-rs-derive"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "db8b7511298d5b7784b40b092d9e9dcd3a627a5707e4b5e507931ab0d44eeebf"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
"synstructure",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "asn1-rs-impl"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "async-stream"
|
name = "async-stream"
|
||||||
version = "0.3.3"
|
version = "0.3.3"
|
||||||
@@ -422,6 +461,7 @@ dependencies = [
|
|||||||
"tar",
|
"tar",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-postgres",
|
"tokio-postgres",
|
||||||
|
"url",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -712,6 +752,12 @@ dependencies = [
|
|||||||
"syn",
|
"syn",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "data-encoding"
|
||||||
|
version = "2.3.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3ee2393c4a91429dffb4bedf19f4d6abf27d8a732c8ce4980305d782e5426d57"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "debugid"
|
name = "debugid"
|
||||||
version = "0.7.3"
|
version = "0.7.3"
|
||||||
@@ -721,6 +767,20 @@ dependencies = [
|
|||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "der-parser"
|
||||||
|
version = "7.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fe398ac75057914d7d07307bf67dc7f3f574a26783b4fc7805a20ffa9f506e82"
|
||||||
|
dependencies = [
|
||||||
|
"asn1-rs",
|
||||||
|
"displaydoc",
|
||||||
|
"nom",
|
||||||
|
"num-bigint",
|
||||||
|
"num-traits",
|
||||||
|
"rusticata-macros",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "digest"
|
name = "digest"
|
||||||
version = "0.9.0"
|
version = "0.9.0"
|
||||||
@@ -762,6 +822,17 @@ dependencies = [
|
|||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "displaydoc"
|
||||||
|
version = "0.2.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3bf95dc3f046b9da4f2d51833c0d3547d8564ef6910f5c1ed130306a75b92886"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "either"
|
name = "either"
|
||||||
version = "1.6.1"
|
version = "1.6.1"
|
||||||
@@ -1731,6 +1802,15 @@ dependencies = [
|
|||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "oid-registry"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "38e20717fa0541f39bd146692035c37bedfa532b3e5071b35761082407546b2a"
|
||||||
|
dependencies = [
|
||||||
|
"asn1-rs",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "once_cell"
|
name = "once_cell"
|
||||||
version = "1.10.0"
|
version = "1.10.0"
|
||||||
@@ -1842,6 +1922,7 @@ dependencies = [
|
|||||||
"tracing",
|
"tracing",
|
||||||
"url",
|
"url",
|
||||||
"utils",
|
"utils",
|
||||||
|
"walkdir",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -2070,7 +2151,7 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"utils",
|
"utils",
|
||||||
"wal_generate",
|
"wal_craft",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -2249,6 +2330,7 @@ dependencies = [
|
|||||||
"url",
|
"url",
|
||||||
"utils",
|
"utils",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
|
"x509-parser",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -2620,6 +2702,15 @@ dependencies = [
|
|||||||
"semver",
|
"semver",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rusticata-macros"
|
||||||
|
version = "4.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632"
|
||||||
|
dependencies = [
|
||||||
|
"nom",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustls"
|
name = "rustls"
|
||||||
version = "0.20.4"
|
version = "0.20.4"
|
||||||
@@ -3059,6 +3150,18 @@ version = "0.1.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8"
|
checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "synstructure"
|
||||||
|
version = "0.12.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
"unicode-xid",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tar"
|
name = "tar"
|
||||||
version = "0.4.38"
|
version = "0.4.38"
|
||||||
@@ -3650,14 +3753,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wal_generate"
|
name = "wal_craft"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"clap 3.0.14",
|
"clap 3.0.14",
|
||||||
"env_logger",
|
"env_logger",
|
||||||
"log",
|
"log",
|
||||||
|
"once_cell",
|
||||||
"postgres",
|
"postgres",
|
||||||
|
"postgres_ffi",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -3921,6 +4026,24 @@ dependencies = [
|
|||||||
"tracing-core",
|
"tracing-core",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "x509-parser"
|
||||||
|
version = "0.13.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9fb9bace5b5589ffead1afb76e43e34cff39cd0f3ce7e170ae0c29e53b88eb1c"
|
||||||
|
dependencies = [
|
||||||
|
"asn1-rs",
|
||||||
|
"base64",
|
||||||
|
"data-encoding",
|
||||||
|
"der-parser",
|
||||||
|
"lazy_static",
|
||||||
|
"nom",
|
||||||
|
"oid-registry",
|
||||||
|
"rusticata-macros",
|
||||||
|
"thiserror",
|
||||||
|
"time 0.3.9",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "xattr"
|
name = "xattr"
|
||||||
version = "0.2.2"
|
version = "0.2.2"
|
||||||
|
|||||||
Dockerfile
@@ -1,5 +1,5 @@
|
|||||||
# Build Postgres
|
# Build Postgres
|
||||||
FROM zimg/rust:1.58 AS pg-build
|
FROM neondatabase/rust:1.58 AS pg-build
|
||||||
WORKDIR /pg
|
WORKDIR /pg
|
||||||
|
|
||||||
USER root
|
USER root
|
||||||
@@ -14,7 +14,7 @@ RUN set -e \
|
|||||||
&& tar -C tmp_install -czf /postgres_install.tar.gz .
|
&& tar -C tmp_install -czf /postgres_install.tar.gz .
|
||||||
|
|
||||||
# Build zenith binaries
|
# Build zenith binaries
|
||||||
FROM zimg/rust:1.58 AS build
|
FROM neondatabase/rust:1.58 AS build
|
||||||
ARG GIT_VERSION=local
|
ARG GIT_VERSION=local
|
||||||
|
|
||||||
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
|
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
|
||||||
@@ -46,9 +46,9 @@ RUN set -e \
|
|||||||
&& useradd -d /data zenith \
|
&& useradd -d /data zenith \
|
||||||
&& chown -R zenith:zenith /data
|
&& chown -R zenith:zenith /data
|
||||||
|
|
||||||
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin
|
COPY --from=build --chown=zenith:zenith /home/runner/target/release/pageserver /usr/local/bin
|
||||||
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin
|
COPY --from=build --chown=zenith:zenith /home/runner/target/release/safekeeper /usr/local/bin
|
||||||
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy /usr/local/bin
|
COPY --from=build --chown=zenith:zenith /home/runner/target/release/proxy /usr/local/bin
|
||||||
|
|
||||||
COPY --from=pg-build /pg/tmp_install/ /usr/local/
|
COPY --from=pg-build /pg/tmp_install/ /usr/local/
|
||||||
COPY --from=pg-build /postgres_install.tar.gz /data/
|
COPY --from=pg-build /postgres_install.tar.gz /data/
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# First transient image to build compute_tools binaries
|
# First transient image to build compute_tools binaries
|
||||||
# NB: keep in sync with rust image version in .circleci/config.yml
|
# NB: keep in sync with rust image version in .circleci/config.yml
|
||||||
FROM zimg/rust:1.58 AS rust-build
|
FROM neondatabase/rust:1.58 AS rust-build
|
||||||
|
|
||||||
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
|
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
|
||||||
ARG AWS_ACCESS_KEY_ID
|
ARG AWS_ACCESS_KEY_ID
|
||||||
@@ -15,4 +15,4 @@ RUN set -e \
|
|||||||
# Final image that only has one binary
|
# Final image that only has one binary
|
||||||
FROM debian:buster-slim
|
FROM debian:buster-slim
|
||||||
|
|
||||||
COPY --from=rust-build /home/circleci/project/target/release/compute_ctl /usr/local/bin/compute_ctl
|
COPY --from=rust-build /home/runner/target/release/compute_ctl /usr/local/bin/compute_ctl
|
||||||
|
|||||||
Makefile
@@ -1,3 +1,8 @@
|
|||||||
|
ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||||
|
|
||||||
|
# Where to install Postgres, default is ./tmp_install, maybe useful for package managers
|
||||||
|
POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/tmp_install
|
||||||
|
|
||||||
# Seccomp BPF is only available for Linux
|
# Seccomp BPF is only available for Linux
|
||||||
UNAME_S := $(shell uname -s)
|
UNAME_S := $(shell uname -s)
|
||||||
ifeq ($(UNAME_S),Linux)
|
ifeq ($(UNAME_S),Linux)
|
||||||
@@ -55,55 +60,55 @@ zenith: postgres-headers
|
|||||||
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
|
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
|
||||||
|
|
||||||
### PostgreSQL parts
|
### PostgreSQL parts
|
||||||
tmp_install/build/config.status:
|
$(POSTGRES_INSTALL_DIR)/build/config.status:
|
||||||
+@echo "Configuring postgres build"
|
+@echo "Configuring postgres build"
|
||||||
mkdir -p tmp_install/build
|
mkdir -p $(POSTGRES_INSTALL_DIR)/build
|
||||||
(cd tmp_install/build && \
|
(cd $(POSTGRES_INSTALL_DIR)/build && \
|
||||||
../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
|
$(ROOT_PROJECT_DIR)/vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
|
||||||
$(PG_CONFIGURE_OPTS) \
|
$(PG_CONFIGURE_OPTS) \
|
||||||
$(SECCOMP) \
|
$(SECCOMP) \
|
||||||
--prefix=$(abspath tmp_install) > configure.log)
|
--prefix=$(abspath $(POSTGRES_INSTALL_DIR)) > configure.log)
|
||||||
|
|
||||||
# nicer alias for running 'configure'
|
# nicer alias for running 'configure'
|
||||||
.PHONY: postgres-configure
|
.PHONY: postgres-configure
|
||||||
postgres-configure: tmp_install/build/config.status
|
postgres-configure: $(POSTGRES_INSTALL_DIR)/build/config.status
|
||||||
|
|
||||||
# Install the PostgreSQL header files into tmp_install/include
|
# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/include
|
||||||
.PHONY: postgres-headers
|
.PHONY: postgres-headers
|
||||||
postgres-headers: postgres-configure
|
postgres-headers: postgres-configure
|
||||||
+@echo "Installing PostgreSQL headers"
|
+@echo "Installing PostgreSQL headers"
|
||||||
$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install
|
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/include MAKELEVEL=0 install
|
||||||
|
|
||||||
# Compile and install PostgreSQL and contrib/neon
|
# Compile and install PostgreSQL and contrib/neon
|
||||||
.PHONY: postgres
|
.PHONY: postgres
|
||||||
postgres: postgres-configure \
|
postgres: postgres-configure \
|
||||||
postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
|
postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
|
||||||
+@echo "Compiling PostgreSQL"
|
+@echo "Compiling PostgreSQL"
|
||||||
$(MAKE) -C tmp_install/build MAKELEVEL=0 install
|
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install
|
||||||
+@echo "Compiling contrib/neon"
|
+@echo "Compiling contrib/neon"
|
||||||
$(MAKE) -C tmp_install/build/contrib/neon install
|
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon install
|
||||||
+@echo "Compiling contrib/neon_test_utils"
|
+@echo "Compiling contrib/neon_test_utils"
|
||||||
$(MAKE) -C tmp_install/build/contrib/neon_test_utils install
|
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon_test_utils install
|
||||||
+@echo "Compiling pg_buffercache"
|
+@echo "Compiling pg_buffercache"
|
||||||
$(MAKE) -C tmp_install/build/contrib/pg_buffercache install
|
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install
|
||||||
+@echo "Compiling pageinspect"
|
+@echo "Compiling pageinspect"
|
||||||
$(MAKE) -C tmp_install/build/contrib/pageinspect install
|
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install
|
||||||
|
|
||||||
|
|
||||||
.PHONY: postgres-clean
|
.PHONY: postgres-clean
|
||||||
postgres-clean:
|
postgres-clean:
|
||||||
$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
|
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean
|
||||||
|
|
||||||
# This doesn't remove the effects of 'configure'.
|
# This doesn't remove the effects of 'configure'.
|
||||||
.PHONY: clean
|
.PHONY: clean
|
||||||
clean:
|
clean:
|
||||||
cd tmp_install/build && $(MAKE) clean
|
cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean
|
||||||
$(CARGO_CMD_PREFIX) cargo clean
|
$(CARGO_CMD_PREFIX) cargo clean
|
||||||
|
|
||||||
# This removes everything
|
# This removes everything
|
||||||
.PHONY: distclean
|
.PHONY: distclean
|
||||||
distclean:
|
distclean:
|
||||||
rm -rf tmp_install
|
rm -rf $(POSTGRES_INSTALL_DIR)
|
||||||
$(CARGO_CMD_PREFIX) cargo clean
|
$(CARGO_CMD_PREFIX) cargo clean
|
||||||
|
|
||||||
.PHONY: fmt
|
.PHONY: fmt
|
||||||
@@ -112,4 +117,4 @@ fmt:
|
|||||||
|
|
||||||
.PHONY: setup-pre-commit-hook
|
.PHONY: setup-pre-commit-hook
|
||||||
setup-pre-commit-hook:
|
setup-pre-commit-hook:
|
||||||
ln -s -f ../../pre-commit.py .git/hooks/pre-commit
|
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
|
||||||
|
|||||||
README.md
@@ -29,7 +29,7 @@ Pageserver consists of:
|
|||||||
## Running local installation
|
## Running local installation
|
||||||
|
|
||||||
|
|
||||||
#### building on Linux
|
#### Installing dependencies on Linux
|
||||||
1. Install build dependencies and other useful packages
|
1. Install build dependencies and other useful packages
|
||||||
|
|
||||||
* On Ubuntu or Debian this set of packages should be sufficient to build the code:
|
* On Ubuntu or Debian this set of packages should be sufficient to build the code:
|
||||||
@@ -49,18 +49,11 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \
|
|||||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Build neon and patched postgres
|
#### Installing dependencies on OSX (12.3.1)
|
||||||
```sh
|
|
||||||
git clone --recursive https://github.com/neondatabase/neon.git
|
|
||||||
cd neon
|
|
||||||
make -j`nproc`
|
|
||||||
```
|
|
||||||
|
|
||||||
#### building on OSX (12.3.1)
|
|
||||||
1. Install XCode and dependencies
|
1. Install XCode and dependencies
|
||||||
```
|
```
|
||||||
xcode-select --install
|
xcode-select --install
|
||||||
brew install protobuf etcd
|
brew install protobuf etcd openssl
|
||||||
```
|
```
|
||||||
|
|
||||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||||
@@ -76,11 +69,20 @@ brew install libpq
|
|||||||
brew link --force libpq
|
brew link --force libpq
|
||||||
```
|
```
|
||||||
|
|
||||||
4. Build neon and patched postgres
|
#### Building on Linux and OSX
|
||||||
```sh
|
|
||||||
|
1. Build neon and patched postgres
|
||||||
|
```
|
||||||
|
# Note: The path to the neon sources cannot contain a space.
|
||||||
|
|
||||||
git clone --recursive https://github.com/neondatabase/neon.git
|
git clone --recursive https://github.com/neondatabase/neon.git
|
||||||
cd neon
|
cd neon
|
||||||
make -j5
|
|
||||||
|
# The default is a debug build, which is noticeably slower than a release
# build. If you want a release build, run:
# BUILD_TYPE=release make -j`nproc`
|
||||||
|
|
||||||
|
make -j`nproc`
|
||||||
```
|
```
|
||||||
|
|
||||||
#### dependency installation notes
|
#### dependency installation notes
|
||||||
@@ -93,7 +95,7 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
|
|||||||
#### running neon database
|
#### running neon database
|
||||||
1. Start pageserver and postgres on top of it (should be called from repo root):
|
1. Start pageserver and postgres on top of it (should be called from repo root):
|
||||||
```sh
|
```sh
|
||||||
# Create repository in .zenith with proper paths to binaries and data
|
# Create repository in .neon with proper paths to binaries and data
|
||||||
# Later that would be the responsibility of a package install script
|
# Later that would be the responsibility of a package install script
|
||||||
> ./target/debug/neon_local init
|
> ./target/debug/neon_local init
|
||||||
initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c
|
initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c
|
||||||
@@ -103,16 +105,16 @@ pageserver init succeeded
|
|||||||
|
|
||||||
# start pageserver and safekeeper
|
# start pageserver and safekeeper
|
||||||
> ./target/debug/neon_local start
|
> ./target/debug/neon_local start
|
||||||
Starting pageserver at '127.0.0.1:64000' in '.zenith'
|
Starting pageserver at '127.0.0.1:64000' in '.neon'
|
||||||
Pageserver started
|
Pageserver started
|
||||||
initializing for sk 1 for 7676
|
initializing for sk 1 for 7676
|
||||||
Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/sk1'
|
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'
|
||||||
Safekeeper started
|
Safekeeper started
|
||||||
|
|
||||||
# start postgres compute node
|
# start postgres compute node
|
||||||
> ./target/debug/neon_local pg start main
|
> ./target/debug/neon_local pg start main
|
||||||
Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
||||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
|
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
|
||||||
Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
|
Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
|
||||||
|
|
||||||
# check list of running postgres instances
|
# check list of running postgres instances
|
||||||
@@ -149,7 +151,7 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
|
|||||||
# start postgres on that branch
|
# start postgres on that branch
|
||||||
> ./target/debug/neon_local pg start migration_check --branch-name migration_check
|
> ./target/debug/neon_local pg start migration_check --branch-name migration_check
|
||||||
Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
|
Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
|
||||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
|
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
|
||||||
Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
|
Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
|
||||||
|
|
||||||
# check the new list of running postgres instances
|
# check the new list of running postgres instances
|
||||||
@@ -209,7 +211,7 @@ Same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, wh
|
|||||||
To get more familiar with this aspect, refer to:
|
To get more familiar with this aspect, refer to:
|
||||||
|
|
||||||
- [Neon glossary](/docs/glossary.md)
|
- [Neon glossary](/docs/glossary.md)
|
||||||
- [PostgreSQL glossary](https://www.postgresql.org/docs/13/glossary.html)
|
- [PostgreSQL glossary](https://www.postgresql.org/docs/14/glossary.html)
|
||||||
- Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres))
|
- Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres))
|
||||||
|
|
||||||
## Join the development
|
## Join the development
|
||||||
|
|||||||
@@ -18,4 +18,5 @@ serde_json = "1"
|
|||||||
tar = "0.4"
|
tar = "0.4"
|
||||||
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
|
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
|
||||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||||
|
url = "2.2.2"
|
||||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ use std::process::exit;
|
|||||||
use std::sync::{Arc, RwLock};
|
use std::sync::{Arc, RwLock};
|
||||||
use std::{thread, time::Duration};
|
use std::{thread, time::Duration};
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::{Context, Result};
|
||||||
use chrono::Utc;
|
use chrono::Utc;
|
||||||
use clap::Arg;
|
use clap::Arg;
|
||||||
use log::{error, info};
|
use log::{error, info};
|
||||||
@@ -45,6 +45,7 @@ use compute_tools::monitor::launch_monitor;
|
|||||||
use compute_tools::params::*;
|
use compute_tools::params::*;
|
||||||
use compute_tools::pg_helpers::*;
|
use compute_tools::pg_helpers::*;
|
||||||
use compute_tools::spec::*;
|
use compute_tools::spec::*;
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
// TODO: re-use `utils::logging` later
|
// TODO: re-use `utils::logging` later
|
||||||
@@ -131,7 +132,7 @@ fn main() -> Result<()> {
|
|||||||
|
|
||||||
let compute_state = ComputeNode {
|
let compute_state = ComputeNode {
|
||||||
start_time: Utc::now(),
|
start_time: Utc::now(),
|
||||||
connstr: connstr.to_string(),
|
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
|
||||||
pgdata: pgdata.to_string(),
|
pgdata: pgdata.to_string(),
|
||||||
pgbin: pgbin.to_string(),
|
pgbin: pgbin.to_string(),
|
||||||
spec,
|
spec,
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
use anyhow::{anyhow, Result};
|
use anyhow::{anyhow, Result};
|
||||||
use log::error;
|
use log::error;
|
||||||
use postgres::Client;
|
use postgres::Client;
|
||||||
@@ -23,9 +21,8 @@ pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn check_writability(compute: &Arc<ComputeNode>) -> Result<()> {
|
pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
|
||||||
let connstr = &compute.connstr;
|
let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
|
||||||
let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?;
|
|
||||||
if client.is_closed() {
|
if client.is_closed() {
|
||||||
return Err(anyhow!("connection to postgres closed"));
|
return Err(anyhow!("connection to postgres closed"));
|
||||||
}
|
}
|
||||||
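The signature change above (`&Arc<ComputeNode>` to `&ComputeNode`) is a drop-in for callers that hold an `Arc`, because `&Arc<T>` deref-coerces to `&T`. A minimal, self-contained sketch with a hypothetical stand-in struct (not the real `ComputeNode`):

```rust
use std::sync::Arc;

// Hypothetical stand-in struct, only here to show the coercion.
struct ComputeNode {
    connstr: String,
}

// Takes a plain reference, as in the new signature.
fn check_writability(compute: &ComputeNode) {
    println!("would check writability via {}", compute.connstr);
}

fn main() {
    let node = Arc::new(ComputeNode {
        connstr: "postgresql://cloud_admin@127.0.0.1:55432/postgres".to_string(),
    });
    // Callers holding an Arc keep working: &Arc<ComputeNode> coerces to &ComputeNode.
    check_writability(&node);
}
```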
|
|||||||
@@ -35,7 +35,8 @@ use crate::spec::*;
|
|||||||
/// Compute node info shared across several `compute_ctl` threads.
|
/// Compute node info shared across several `compute_ctl` threads.
|
||||||
pub struct ComputeNode {
|
pub struct ComputeNode {
|
||||||
pub start_time: DateTime<Utc>,
|
pub start_time: DateTime<Utc>,
|
||||||
pub connstr: String,
|
// Url type maintains proper escaping
|
||||||
|
pub connstr: url::Url,
|
||||||
pub pgdata: String,
|
pub pgdata: String,
|
||||||
pub pgbin: String,
|
pub pgbin: String,
|
||||||
pub spec: ComputeSpec,
|
pub spec: ComputeSpec,
|
||||||
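A minimal sketch of why `connstr` becomes a `url::Url` here: editing the username (or path) is a structured operation instead of a textual `replacen`, so escaping stays correct. The connection string below is made up for illustration; the real one is passed to `compute_ctl` at startup.

```rust
use url::Url;

fn main() {
    // Made-up connection string, for illustration only.
    let mut connstr =
        Url::parse("postgresql://cloud_admin@127.0.0.1:55432/postgres").expect("valid URL");

    // Structured edit of the username, as done in the zenith_admin fallback in the next hunk.
    connstr
        .set_username("zenith_admin")
        .expect("URL has an authority part, so a username can be set");

    assert_eq!(connstr.username(), "zenith_admin");
    // as_str() yields a properly escaped connection string for Client::connect().
    println!("{}", connstr.as_str());
}
```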
@@ -268,28 +269,33 @@ impl ComputeNode {
|
|||||||
// In this case we need to connect with the old `zenith_admin` name
|
// In this case we need to connect with the old `zenith_admin` name
|
||||||
// and create new user. We cannot simply rename connected user,
|
// and create new user. We cannot simply rename connected user,
|
||||||
// but we can create a new one and grant it all privileges.
|
// but we can create a new one and grant it all privileges.
|
||||||
let mut client = match Client::connect(&self.connstr, NoTls) {
|
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
info!(
|
info!(
|
||||||
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
|
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
|
||||||
e
|
e
|
||||||
);
|
);
|
||||||
let zenith_admin_connstr = self.connstr.replacen("cloud_admin", "zenith_admin", 1);
|
let mut zenith_admin_connstr = self.connstr.clone();
|
||||||
|
|
||||||
let mut client = Client::connect(&zenith_admin_connstr, NoTls)?;
|
zenith_admin_connstr
|
||||||
|
.set_username("zenith_admin")
|
||||||
|
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
|
||||||
|
|
||||||
|
let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
|
||||||
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
||||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||||
drop(client);
|
drop(client);
|
||||||
|
|
||||||
// reconnect with connstring with the expected username
|
// reconnect with connstring with the expected username
|
||||||
Client::connect(&self.connstr, NoTls)?
|
Client::connect(self.connstr.as_str(), NoTls)?
|
||||||
}
|
}
|
||||||
Ok(client) => client,
|
Ok(client) => client,
|
||||||
};
|
};
|
||||||
|
|
||||||
handle_roles(&self.spec, &mut client)?;
|
handle_roles(&self.spec, &mut client)?;
|
||||||
handle_databases(&self.spec, &mut client)?;
|
handle_databases(&self.spec, &mut client)?;
|
||||||
handle_grants(&self.spec, &mut client)?;
|
handle_role_deletions(self, &mut client)?;
|
||||||
|
handle_grants(self, &mut client)?;
|
||||||
create_writablity_check_data(&mut client)?;
|
create_writablity_check_data(&mut client)?;
|
||||||
|
|
||||||
// 'Close' connection
|
// 'Close' connection
|
||||||
|
|||||||
@@ -13,11 +13,11 @@ const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
|
|||||||
// Spin in a loop and figure out the last activity time in the Postgres.
|
// Spin in a loop and figure out the last activity time in the Postgres.
|
||||||
// Then update it in the shared state. This function never errors out.
|
// Then update it in the shared state. This function never errors out.
|
||||||
// XXX: the only expected panic is at `RwLock` unwrap().
|
// XXX: the only expected panic is at `RwLock` unwrap().
|
||||||
fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
fn watch_compute_activity(compute: &ComputeNode) {
|
||||||
// Suppose that `connstr` doesn't change
|
// Suppose that `connstr` doesn't change
|
||||||
let connstr = compute.connstr.clone();
|
let connstr = compute.connstr.as_str();
|
||||||
// Define `client` outside of the loop to reuse existing connection if it's active.
|
// Define `client` outside of the loop to reuse existing connection if it's active.
|
||||||
let mut client = Client::connect(&connstr, NoTls);
|
let mut client = Client::connect(connstr, NoTls);
|
||||||
let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
|
let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
|
||||||
|
|
||||||
info!("watching Postgres activity at {}", connstr);
|
info!("watching Postgres activity at {}", connstr);
|
||||||
@@ -32,7 +32,7 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
|||||||
info!("connection to postgres closed, trying to reconnect");
|
info!("connection to postgres closed, trying to reconnect");
|
||||||
|
|
||||||
// Connection is closed, reconnect and try again.
|
// Connection is closed, reconnect and try again.
|
||||||
client = Client::connect(&connstr, NoTls);
|
client = Client::connect(connstr, NoTls);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -93,7 +93,7 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
|||||||
debug!("cannot connect to postgres: {}, retrying", e);
|
debug!("cannot connect to postgres: {}, retrying", e);
|
||||||
|
|
||||||
// Establish a new connection and try again.
|
// Establish a new connection and try again.
|
||||||
client = Client::connect(&connstr, NoTls);
|
client = Client::connect(connstr, NoTls);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
use std::fmt::Write;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{BufRead, BufReader};
|
use std::io::{BufRead, BufReader};
|
||||||
use std::net::{SocketAddr, TcpStream};
|
use std::net::{SocketAddr, TcpStream};
|
||||||
@@ -138,9 +139,11 @@ impl Role {
|
|||||||
// Now we also support SCRAM-SHA-256 and to preserve compatibility
|
// Now we also support SCRAM-SHA-256 and to preserve compatibility
|
||||||
// we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256.
|
// we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256.
|
||||||
if pass.starts_with("SCRAM-SHA-256") {
|
if pass.starts_with("SCRAM-SHA-256") {
|
||||||
params.push_str(&format!(" PASSWORD '{}'", pass));
|
write!(params, " PASSWORD '{pass}'")
|
||||||
|
.expect("String is documented to not to error during write operations");
|
||||||
} else {
|
} else {
|
||||||
params.push_str(&format!(" PASSWORD 'md5{}'", pass));
|
write!(params, " PASSWORD 'md5{pass}'")
|
||||||
|
.expect("String is documented to not to error during write operations");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
params.push_str(" PASSWORD NULL");
|
params.push_str(" PASSWORD NULL");
|
||||||
@@ -158,7 +161,8 @@ impl Database {
|
|||||||
/// it may require a proper quoting too.
|
/// it may require a proper quoting too.
|
||||||
pub fn to_pg_options(&self) -> String {
|
pub fn to_pg_options(&self) -> String {
|
||||||
let mut params: String = self.options.as_pg_options();
|
let mut params: String = self.options.as_pg_options();
|
||||||
params.push_str(&format!(" OWNER {}", &self.owner.quote()));
|
write!(params, " OWNER {}", &self.owner.quote())
|
||||||
|
.expect("String is documented to not to error during write operations");
|
||||||
|
|
||||||
params
|
params
|
||||||
}
|
}
|
||||||
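A small sketch of the `write!`-into-`String` idiom adopted above: with `std::fmt::Write` in scope, formatting appends in place instead of allocating a temporary `String` via `push_str(&format!(...))`, and the returned `fmt::Result` is not expected to fail for a `String` target (hence the `expect`). Values below are made up.

```rust
use std::fmt::Write as _;

fn main() {
    let mut params = String::from("LOGIN");
    let pass = "SCRAM-SHA-256$4096:made-up-hash"; // made-up value for illustration

    // Appends directly into `params`; no temporary String from format!().
    write!(params, " PASSWORD '{pass}'")
        .expect("writing into a String is not expected to fail");

    assert_eq!(params, "LOGIN PASSWORD 'SCRAM-SHA-256$4096:made-up-hash'");
}
```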
@@ -244,18 +248,20 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()
|
|||||||
bail!("Postgres exited unexpectedly with code {}", code);
|
bail!("Postgres exited unexpectedly with code {}", code);
|
||||||
}
|
}
|
||||||
|
|
||||||
if pid_path.exists() {
|
// Check that we can open pid file first.
|
||||||
let file = BufReader::new(File::open(&pid_path)?);
|
if let Ok(file) = File::open(&pid_path) {
|
||||||
let status = file
|
let file = BufReader::new(file);
|
||||||
.lines()
|
let last_line = file.lines().last();
|
||||||
.last()
|
|
||||||
.unwrap()
|
|
||||||
.unwrap_or_else(|_| "unknown".to_string());
|
|
||||||
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
|
|
||||||
|
|
||||||
// Now Postgres is ready to accept connections
|
// Pid file could be there and we could read it, but it could be empty, for example.
|
||||||
if status.trim() == "ready" && can_connect {
|
if let Some(Ok(line)) = last_line {
|
||||||
break;
|
let status = line.trim();
|
||||||
|
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
|
||||||
|
|
||||||
|
// Now Postgres is ready to accept connections
|
||||||
|
if status == "ready" && can_connect {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::{anyhow, Result};
|
||||||
use log::{info, log_enabled, warn, Level};
|
use log::{info, log_enabled, warn, Level};
|
||||||
use postgres::Client;
|
use postgres::error::SqlState;
|
||||||
|
use postgres::{Client, NoTls};
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
|
|
||||||
|
use crate::compute::ComputeNode;
|
||||||
use crate::config;
|
use crate::config;
|
||||||
use crate::params::PG_HBA_ALL_MD5;
|
use crate::params::PG_HBA_ALL_MD5;
|
||||||
use crate::pg_helpers::*;
|
use crate::pg_helpers::*;
|
||||||
@@ -97,18 +99,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
|
|
||||||
// Process delta operations first
|
// Process delta operations first
|
||||||
if let Some(ops) = &spec.delta_operations {
|
if let Some(ops) = &spec.delta_operations {
|
||||||
info!("processing delta operations on roles");
|
info!("processing role renames");
|
||||||
for op in ops {
|
for op in ops {
|
||||||
match op.action.as_ref() {
|
match op.action.as_ref() {
|
||||||
// We do not check either role exists or not,
|
|
||||||
// Postgres will take care of it for us
|
|
||||||
"delete_role" => {
|
"delete_role" => {
|
||||||
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
|
// no-op now, roles will be deleted at the end of configuration
|
||||||
|
|
||||||
warn!("deleting role '{}'", &op.name);
|
|
||||||
xact.execute(query.as_str(), &[])?;
|
|
||||||
}
|
}
|
||||||
// Renaming role drops its password, since tole name is
|
// Renaming role drops its password, since role name is
|
||||||
// used as a salt there. It is important that this role
|
// used as a salt there. It is important that this role
|
||||||
// is recorded with a new `name` in the `roles` list.
|
// is recorded with a new `name` in the `roles` list.
|
||||||
// Follow up roles update will set the new password.
|
// Follow up roles update will set the new password.
|
||||||
@@ -182,7 +179,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
xact.execute(query.as_str(), &[])?;
|
xact.execute(query.as_str(), &[])?;
|
||||||
|
|
||||||
let grant_query = format!(
|
let grant_query = format!(
|
||||||
"grant pg_read_all_data, pg_write_all_data to {}",
|
"GRANT pg_read_all_data, pg_write_all_data TO {}",
|
||||||
name.quote()
|
name.quote()
|
||||||
);
|
);
|
||||||
xact.execute(grant_query.as_str(), &[])?;
|
xact.execute(grant_query.as_str(), &[])?;
|
||||||
@@ -197,6 +194,70 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reassign all dependent objects and delete requested roles.
|
||||||
|
pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||||
|
let spec = &node.spec;
|
||||||
|
|
||||||
|
// First, reassign all dependent objects to db owners.
|
||||||
|
if let Some(ops) = &spec.delta_operations {
|
||||||
|
info!("reassigning dependent objects of to-be-deleted roles");
|
||||||
|
for op in ops {
|
||||||
|
if op.action == "delete_role" {
|
||||||
|
reassign_owned_objects(node, &op.name)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second, proceed with role deletions.
|
||||||
|
let mut xact = client.transaction()?;
|
||||||
|
if let Some(ops) = &spec.delta_operations {
|
||||||
|
info!("processing role deletions");
|
||||||
|
for op in ops {
|
||||||
|
// We do not check whether the role exists or not,
|
||||||
|
// Postgres will take care of it for us
|
||||||
|
if op.action == "delete_role" {
|
||||||
|
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
|
||||||
|
|
||||||
|
warn!("deleting role '{}'", &op.name);
|
||||||
|
xact.execute(query.as_str(), &[])?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reassign all owned objects in all databases to the owner of the database.
|
||||||
|
fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
|
||||||
|
for db in &node.spec.cluster.databases {
|
||||||
|
if db.owner != *role_name {
|
||||||
|
let mut connstr = node.connstr.clone();
|
||||||
|
// database name is always the last and the only component of the path
|
||||||
|
connstr.set_path(&db.name);
|
||||||
|
|
||||||
|
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||||
|
|
||||||
|
// This will reassign all dependent objects to the db owner
|
||||||
|
let reassign_query = format!(
|
||||||
|
"REASSIGN OWNED BY {} TO {}",
|
||||||
|
role_name.quote(),
|
||||||
|
db.owner.quote()
|
||||||
|
);
|
||||||
|
info!(
|
||||||
|
"reassigning objects owned by '{}' in db '{}' to '{}'",
|
||||||
|
role_name, &db.name, &db.owner
|
||||||
|
);
|
||||||
|
client.simple_query(&reassign_query)?;
|
||||||
|
|
||||||
|
// This now will only drop privileges of the role
|
||||||
|
let drop_query = format!("DROP OWNED BY {}", role_name.quote());
|
||||||
|
client.simple_query(&drop_query)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// It follows mostly the same logic as `handle_roles()` except that it
|
/// It follows mostly the same logic as `handle_roles()` except that it
|
||||||
/// does not use an explicit transaction block, since major database operations
|
/// does not use an explicit transaction block, since major database operations
|
||||||
/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
|
/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
|
||||||
@@ -289,23 +350,66 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
// Grant CREATE ON DATABASE to the database owner
|
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
|
||||||
// to allow clients create trusted extensions.
|
/// to allow users to create trusted extensions and re-create the `public` schema, for example.
|
||||||
pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||||
|
let spec = &node.spec;
|
||||||
|
|
||||||
info!("cluster spec grants:");
|
info!("cluster spec grants:");
|
||||||
|
|
||||||
|
// We now have a separate `web_access` role to connect to the database
|
||||||
|
// via the web interface and proxy link auth. And also we grant a
|
||||||
|
// read / write all data privilege to every role. So also grant
|
||||||
|
// create to everyone.
|
||||||
|
// XXX: later we should stop messing with Postgres ACL in such horrible
|
||||||
|
// ways.
|
||||||
|
let roles = spec
|
||||||
|
.cluster
|
||||||
|
.roles
|
||||||
|
.iter()
|
||||||
|
.map(|r| r.name.quote())
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
for db in &spec.cluster.databases {
|
for db in &spec.cluster.databases {
|
||||||
let dbname = &db.name;
|
let dbname = &db.name;
|
||||||
|
|
||||||
let query: String = format!(
|
let query: String = format!(
|
||||||
"GRANT CREATE ON DATABASE {} TO {}",
|
"GRANT CREATE ON DATABASE {} TO {}",
|
||||||
dbname.quote(),
|
dbname.quote(),
|
||||||
db.owner.quote()
|
roles.join(", ")
|
||||||
);
|
);
|
||||||
info!("grant query {}", &query);
|
info!("grant query {}", &query);
|
||||||
|
|
||||||
client.execute(query.as_str(), &[])?;
|
client.execute(query.as_str(), &[])?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Do some per-database access adjustments. We'd better do this at db creation time,
|
||||||
|
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
|
||||||
|
// atomically.
|
||||||
|
let mut db_connstr = node.connstr.clone();
|
||||||
|
for db in &node.spec.cluster.databases {
|
||||||
|
// database name is always the last and the only component of the path
|
||||||
|
db_connstr.set_path(&db.name);
|
||||||
|
|
||||||
|
let mut db_client = Client::connect(db_connstr.as_str(), NoTls)?;
|
||||||
|
|
||||||
|
// This will only change ownership on the schema itself, not the objects
|
||||||
|
// inside it. Without it owner of the `public` schema will be `cloud_admin`
|
||||||
|
// and database owner cannot do anything with it.
|
||||||
|
let alter_query = format!("ALTER SCHEMA public OWNER TO {}", db.owner.quote());
|
||||||
|
let res = db_client.simple_query(&alter_query);
|
||||||
|
|
||||||
|
if let Err(e) = res {
|
||||||
|
if e.code() == Some(&SqlState::INVALID_SCHEMA_NAME) {
|
||||||
|
// This is OK, the db just doesn't have a `public` schema.
|
||||||
|
// Probably the user dropped it manually.
|
||||||
|
info!("no 'public' schema found in the database {}", db.name);
|
||||||
|
} else {
|
||||||
|
// Something different happened, propagate the error
|
||||||
|
return Err(anyhow!(e));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -21,9 +21,9 @@ use utils::{
|
|||||||
use crate::safekeeper::SafekeeperNode;
|
use crate::safekeeper::SafekeeperNode;
|
||||||
|
|
||||||
//
|
//
|
||||||
// This data structures represents zenith CLI config
|
// This data structure represents the neon_local CLI config
|
||||||
//
|
//
|
||||||
// It is deserialized from the .zenith/config file, or the config file passed
|
// It is deserialized from the .neon/config file, or the config file passed
|
||||||
// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
|
// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
|
||||||
// an example.
|
// an example.
|
||||||
//
|
//
|
||||||
@@ -34,8 +34,8 @@ pub struct LocalEnv {
|
|||||||
// compute nodes).
|
// compute nodes).
|
||||||
//
|
//
|
||||||
// This is not stored in the config file. Rather, this is the path where the
|
// This is not stored in the config file. Rather, this is the path where the
|
||||||
// config file itself is. It is read from the ZENITH_REPO_DIR env variable or
|
// config file itself is. It is read from the NEON_REPO_DIR env variable or
|
||||||
// '.zenith' if not given.
|
// '.neon' if not given.
|
||||||
#[serde(skip)]
|
#[serde(skip)]
|
||||||
pub base_data_dir: PathBuf,
|
pub base_data_dir: PathBuf,
|
||||||
|
|
||||||
@@ -177,6 +177,7 @@ pub struct SafekeeperConf {
|
|||||||
pub sync: bool,
|
pub sync: bool,
|
||||||
pub remote_storage: Option<String>,
|
pub remote_storage: Option<String>,
|
||||||
pub backup_threads: Option<u32>,
|
pub backup_threads: Option<u32>,
|
||||||
|
pub auth_enabled: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for SafekeeperConf {
|
impl Default for SafekeeperConf {
|
||||||
@@ -188,6 +189,7 @@ impl Default for SafekeeperConf {
|
|||||||
sync: true,
|
sync: true,
|
||||||
remote_storage: None,
|
remote_storage: None,
|
||||||
backup_threads: None,
|
backup_threads: None,
|
||||||
|
auth_enabled: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -337,7 +339,7 @@ impl LocalEnv {
|
|||||||
pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
|
pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
|
||||||
// Currently, the user first passes a config file with 'zenith init --config=<path>'
|
// Currently, the user first passes a config file with 'zenith init --config=<path>'
|
||||||
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
|
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
|
||||||
// to .zenith/config. TODO: We lose any formatting and comments along the way, which is
|
// to .neon/config. TODO: We lose any formatting and comments along the way, which is
|
||||||
// a bit sad.
|
// a bit sad.
|
||||||
let mut conf_content = r#"# This file describes a locale deployment of the page server
|
let mut conf_content = r#"# This file describes a locale deployment of the page server
|
||||||
# and safekeeeper node. It is read by the 'zenith' command-line
|
# and safekeeeper node. It is read by the 'zenith' command-line
|
||||||
@@ -401,16 +403,6 @@ impl LocalEnv {
|
|||||||
self.pg_distrib_dir.display()
|
self.pg_distrib_dir.display()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
for binary in ["pageserver", "safekeeper"] {
|
|
||||||
if !self.zenith_distrib_dir.join(binary).exists() {
|
|
||||||
bail!(
|
|
||||||
"Can't find binary '{}' in zenith distrib dir '{}'",
|
|
||||||
binary,
|
|
||||||
self.zenith_distrib_dir.display()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for binary in ["pageserver", "safekeeper"] {
|
for binary in ["pageserver", "safekeeper"] {
|
||||||
if !self.zenith_distrib_dir.join(binary).exists() {
|
if !self.zenith_distrib_dir.join(binary).exists() {
|
||||||
bail!(
|
bail!(
|
||||||
@@ -419,12 +411,6 @@ impl LocalEnv {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !self.pg_distrib_dir.join("bin/postgres").exists() {
|
|
||||||
bail!(
|
|
||||||
"Can't find postgres binary at {}",
|
|
||||||
self.pg_distrib_dir.display()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
fs::create_dir(&base_path)?;
|
fs::create_dir(&base_path)?;
|
||||||
|
|
||||||
@@ -481,9 +467,9 @@ impl LocalEnv {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn base_path() -> PathBuf {
|
fn base_path() -> PathBuf {
|
||||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
match std::env::var_os("NEON_REPO_DIR") {
|
||||||
Some(val) => PathBuf::from(val),
|
Some(val) => PathBuf::from(val),
|
||||||
None => PathBuf::from(".zenith"),
|
None => PathBuf::from(".neon"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -149,6 +149,11 @@ impl SafekeeperNode {
|
|||||||
if let Some(ref remote_storage) = self.conf.remote_storage {
|
if let Some(ref remote_storage) = self.conf.remote_storage {
|
||||||
cmd.args(&["--remote-storage", remote_storage]);
|
cmd.args(&["--remote-storage", remote_storage]);
|
||||||
}
|
}
|
||||||
|
if self.conf.auth_enabled {
|
||||||
|
cmd.arg("--auth-validation-public-key-path");
|
||||||
|
// PathBuf is better passed as-is, not via `String`.
|
||||||
|
cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
|
||||||
|
}
|
||||||
|
|
||||||
fill_aws_secrets_vars(&mut cmd);
|
fill_aws_secrets_vars(&mut cmd);
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::io::Write;
|
use std::fs::File;
|
||||||
|
use std::io::{BufReader, Write};
|
||||||
use std::net::TcpStream;
|
use std::net::TcpStream;
|
||||||
use std::num::NonZeroU64;
|
use std::num::NonZeroU64;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
@@ -527,4 +528,54 @@ impl PageServerNode {
|
|||||||
|
|
||||||
Ok(timeline_info_response)
|
Ok(timeline_info_response)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Import a basebackup prepared using either:
|
||||||
|
/// a) `pg_basebackup -F tar`, or
|
||||||
|
/// b) The `fullbackup` pageserver endpoint
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `tenant_id` - tenant to import into. Created if not exists
|
||||||
|
/// * `timeline_id` - id to assign to imported timeline
|
||||||
|
/// * `base` - (start lsn of basebackup, path to `base.tar` file)
|
||||||
|
/// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`)
|
||||||
|
pub fn timeline_import(
|
||||||
|
&self,
|
||||||
|
tenant_id: ZTenantId,
|
||||||
|
timeline_id: ZTimelineId,
|
||||||
|
base: (Lsn, PathBuf),
|
||||||
|
pg_wal: Option<(Lsn, PathBuf)>,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
|
||||||
|
|
||||||
|
// Init base reader
|
||||||
|
let (start_lsn, base_tarfile_path) = base;
|
||||||
|
let base_tarfile = File::open(base_tarfile_path)?;
|
||||||
|
let mut base_reader = BufReader::new(base_tarfile);
|
||||||
|
|
||||||
|
// Init wal reader if necessary
|
||||||
|
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
|
||||||
|
let wal_tarfile = File::open(wal_tarfile_path)?;
|
||||||
|
let wal_reader = BufReader::new(wal_tarfile);
|
||||||
|
(end_lsn, Some(wal_reader))
|
||||||
|
} else {
|
||||||
|
(start_lsn, None)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Import base
|
||||||
|
let import_cmd =
|
||||||
|
format!("import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
|
||||||
|
let mut writer = client.copy_in(&import_cmd)?;
|
||||||
|
io::copy(&mut base_reader, &mut writer)?;
|
||||||
|
writer.finish()?;
|
||||||
|
|
||||||
|
// Import wal if necessary
|
||||||
|
if let Some(mut wal_reader) = wal_reader {
|
||||||
|
let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
|
||||||
|
let mut writer = client.copy_in(&import_cmd)?;
|
||||||
|
io::copy(&mut wal_reader, &mut writer)?;
|
||||||
|
writer.finish()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,12 +36,12 @@ This is how the `LOGICAL_TIMELINE_SIZE` metric is implemented in the pageserver.
|
|||||||
Alternatively, we could count only relation data. As in pg_database_size().
|
Alternatively, we could count only relation data. As in pg_database_size().
|
||||||
This approach is somewhat more user-friendly because it is the data that is really affected by the user.
|
This approach is somewhat more user-friendly because it is the data that is really affected by the user.
|
||||||
On the other hand, it puts us in a weaker position than other services, e.g., RDS.
|
On the other hand, it puts us in a weaker position than other services, e.g., RDS.
|
||||||
We will need to refactor the timeline_size counter or add another counter to implement it.
|
We will need to refactor the timeline_size counter or add another counter to implement it.
|
||||||
|
|
||||||
Timeline size is updated during WAL digestion. It is not versioned and is valid at the last_received_lsn moment.
|
Timeline size is updated during WAL digestion. It is not versioned and is valid at the last_received_lsn moment.
|
||||||
Then this size should be reported to the compute node.
|
Then this size should be reported to the compute node.
|
||||||
|
|
||||||
`current_timeline_size` value is included in the walreceiver's custom feedback message: `ZenithFeedback.`
|
The `current_timeline_size` value is included in the walreceiver's custom feedback message, `ReplicationFeedback`.
|
||||||
|
|
||||||
(PR about protocol changes https://github.com/zenithdb/zenith/pull/1037).
|
(PR about protocol changes https://github.com/zenithdb/zenith/pull/1037).
|
||||||
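
As a clarifying illustration (not the actual wire format), a feedback payload carrying the current timeline size might look roughly like the Rust sketch below; the struct and field names are assumptions made for this sketch, only `current_timeline_size` and the lagging-LSN semantics come from the text above.

// Hypothetical sketch of a walreceiver feedback payload; not the real
// ReplicationFeedback definition.
#[derive(Debug, Clone, Copy)]
pub struct TimelineSizeFeedbackSketch {
    /// Logical size of the timeline in bytes, as counted by the pageserver.
    pub current_timeline_size: u64,
    /// WAL position the size was measured at (the last digested LSN).
    pub last_received_lsn: u64,
}

impl TimelineSizeFeedbackSketch {
    /// The compute node can compare the reported size against a project quota.
    pub fn exceeds(&self, limit_bytes: u64) -> bool {
        self.current_timeline_size >= limit_bytes
    }
}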
|
|
||||||
@@ -64,11 +64,11 @@ We should warn users if the limit is soon to be reached.
|
|||||||
### **Reliability, failure modes and corner cases**
|
### **Reliability, failure modes and corner cases**
|
||||||
|
|
||||||
1. `current_timeline_size` is valid at the last LSN received and digested by the pageserver.
|
1. `current_timeline_size` is valid at the last LSN received and digested by the pageserver.
|
||||||
|
|
||||||
If pageserver lags behind compute node, `current_timeline_size` will lag too. This lag can be tuned using backpressure, but it is not expected to be 0 all the time.
|
If pageserver lags behind compute node, `current_timeline_size` will lag too. This lag can be tuned using backpressure, but it is not expected to be 0 all the time.
|
||||||
|
|
||||||
So transactions that happen in this LSN range may cause limit overflow, especially operations that generate (e.g., CREATE DATABASE) or free (e.g., TRUNCATE) a lot of data pages while generating a small amount of WAL. Are there other operations like this?
|
So transactions that happen in this LSN range may cause limit overflow, especially operations that generate (e.g., CREATE DATABASE) or free (e.g., TRUNCATE) a lot of data pages while generating a small amount of WAL. Are there other operations like this?
|
||||||
|
|
||||||
Currently, CREATE DATABASE operations are restricted in the console. So this is not an issue.
|
Currently, CREATE DATABASE operations are restricted in the console. So this is not an issue.
|
||||||
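
A minimal sketch of how a quota check could account for that lag is shown below; the margin value and helper function are hypothetical, not something this RFC prescribes.

/// Hypothetical quota check: pad the reported (lagging) size with a safety
/// margin for the WAL range the pageserver has not digested yet.
fn over_quota(reported_size: u64, quota: u64, lag_margin: u64) -> bool {
    reported_size.saturating_add(lag_margin) >= quota
}

fn main() {
    // 9.5 GiB reported, 10 GiB quota, 1 GiB margin for the un-digested range.
    assert!(over_quota(9_728u64 << 20, 10u64 << 30, 1u64 << 30));
}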
|
|
||||||
|
|
||||||
|
|||||||
@@ -154,7 +154,7 @@ The default distrib dir is `./tmp_install/`.
|
|||||||
#### workdir (-D)
|
#### workdir (-D)
|
||||||
|
|
||||||
A directory in the file system where the pageserver will store its files.
|
A directory in the file system where the pageserver will store its files.
|
||||||
The default is `./.zenith/`.
|
The default is `./.neon/`.
|
||||||
|
|
||||||
This parameter has a special CLI alias (`-D`) and cannot be overridden via the regular `-c` mechanism.
|
This parameter has a special CLI alias (`-D`) and cannot be overridden via the regular `-c` mechanism.
|
||||||
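
For illustration only, here is a tiny Rust sketch of the resolution order implied above (an explicit `-D` wins, otherwise the `./.neon/` default); the argument parsing is a hypothetical stand-in, not the pageserver's actual CLI code.

use std::path::PathBuf;

/// Hypothetical helper: prefer an explicit `-D <dir>`, fall back to `./.neon/`.
fn resolve_workdir(cli_workdir: Option<&str>) -> PathBuf {
    cli_workdir.map(PathBuf::from).unwrap_or_else(|| PathBuf::from("./.neon"))
}

fn main() {
    // e.g. `pageserver -D /storage/pageserver/data`
    let args: Vec<String> = std::env::args().collect();
    let cli_dir = args
        .iter()
        .position(|a| a == "-D")
        .and_then(|i| args.get(i + 1))
        .map(String::as_str);
    println!("workdir: {}", resolve_workdir(cli_dir).display());
}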
|
|
||||||
|
|||||||
@@ -1,62 +1,81 @@
|
|||||||
//! A set of primitives to access shared data/updates, propagated via the etcd broker (not persistent).
|
//! A set of primitives to access shared data/updates, propagated via the etcd broker (not persistent).
|
||||||
//! Intended to connect services to each other, not to store their data.
|
//! Intended to connect services to each other, not to store their data.
|
||||||
use std::{
|
|
||||||
collections::{hash_map, HashMap},
|
|
||||||
fmt::Display,
|
|
||||||
str::FromStr,
|
|
||||||
};
|
|
||||||
|
|
||||||
use once_cell::sync::Lazy;
|
/// All broker keys that are used when dealing with etcd.
|
||||||
use regex::{Captures, Regex};
|
pub mod subscription_key;
|
||||||
use serde::{Deserialize, Serialize};
|
/// All broker values that can be used when dealing with etcd.
|
||||||
use serde_with::{serde_as, DisplayFromStr};
|
pub mod subscription_value;
|
||||||
|
|
||||||
pub use etcd_client::*;
|
use std::str::FromStr;
|
||||||
|
|
||||||
|
use serde::de::DeserializeOwned;
|
||||||
|
|
||||||
|
use subscription_key::SubscriptionKey;
|
||||||
use tokio::{sync::mpsc, task::JoinHandle};
|
use tokio::{sync::mpsc, task::JoinHandle};
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::{
|
|
||||||
lsn::Lsn,
|
use crate::subscription_key::SubscriptionFullKey;
|
||||||
zid::{NodeId, ZTenantId, ZTenantTimelineId},
|
|
||||||
};
|
pub use etcd_client::*;
|
||||||
|
|
||||||
/// Default value to use for prefixing to all etcd keys with.
|
/// Default value to use for prefixing to all etcd keys with.
|
||||||
/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster.
|
/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster.
|
||||||
pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon";
|
pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon";
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, Serialize)]
|
/// A way to control the data retrieval from a certain subscription.
|
||||||
struct SafekeeperTimeline {
|
pub struct BrokerSubscription<V> {
|
||||||
safekeeper_id: NodeId,
|
/// An unbounded channel to fetch the relevant etcd updates from.
|
||||||
info: SkTimelineInfo,
|
pub value_updates: mpsc::UnboundedReceiver<BrokerUpdate<V>>,
|
||||||
|
key: SubscriptionKey,
|
||||||
|
/// A subscription task handle, to allow waiting on it for the task to complete.
|
||||||
|
/// Both the updates channel and the handle require `&mut`, so it's better to keep
|
||||||
|
/// both `pub` to allow using both in the same structures without the borrow checker complaining.
|
||||||
|
pub watcher_handle: JoinHandle<Result<(), BrokerError>>,
|
||||||
|
watcher: Watcher,
|
||||||
}
|
}
|
||||||
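
To make the shape of the new API concrete, here is a hedged usage sketch built only from items shown in this diff (`SubscriptionKey::sk_timeline_info`, `subscribe_for_json_values`, `BrokerSubscription::cancel`); the crate paths and the surrounding function are assumptions for the sketch, not code from the repository.

// Hypothetical consumer: watch safekeeper timeline info for one timeline,
// drain updates from the unbounded channel, then cancel the watch.
use etcd_broker::subscription_key::SubscriptionKey;
use etcd_broker::subscription_value::SkTimelineInfo;
use etcd_broker::{subscribe_for_json_values, BrokerError, Client, DEFAULT_NEON_BROKER_ETCD_PREFIX};
use utils::zid::ZTenantTimelineId;

async fn watch_sk_timeline(client: &mut Client, id: ZTenantTimelineId) -> Result<(), BrokerError> {
    let key = SubscriptionKey::sk_timeline_info(DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), id);
    let mut subscription = subscribe_for_json_values::<SkTimelineInfo>(client, key).await?;

    while let Some(update) = subscription.value_updates.recv().await {
        // `etcd_version` lets the caller discard stale revisions if needed.
        tracing::info!(
            "etcd v{} for {}: commit_lsn = {:?}",
            update.etcd_version,
            update.key,
            update.value.commit_lsn
        );
    }

    subscription.cancel().await
}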
|
|
||||||
/// Published data about safekeeper's timeline. Fields made optional for easy migrations.
|
impl<V> BrokerSubscription<V> {
|
||||||
#[serde_as]
|
/// Cancels the subscription, stopping the data poller and waiting for it to shut down.
|
||||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
pub async fn cancel(mut self) -> Result<(), BrokerError> {
|
||||||
pub struct SkTimelineInfo {
|
self.watcher.cancel().await.map_err(|e| {
|
||||||
/// Term of the last entry.
|
BrokerError::EtcdClient(
|
||||||
pub last_log_term: Option<u64>,
|
e,
|
||||||
/// LSN of the last record.
|
format!("Failed to cancel broker subscription, kind: {:?}", self.key),
|
||||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
)
|
||||||
#[serde(default)]
|
})?;
|
||||||
pub flush_lsn: Option<Lsn>,
|
match (&mut self.watcher_handle).await {
|
||||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
Ok(res) => res,
|
||||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
Err(e) => {
|
||||||
#[serde(default)]
|
if e.is_cancelled() {
|
||||||
pub commit_lsn: Option<Lsn>,
|
// don't error on the tasks that are cancelled already
|
||||||
/// LSN up to which safekeeper has backed WAL.
|
Ok(())
|
||||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
} else {
|
||||||
#[serde(default)]
|
Err(BrokerError::InternalError(format!(
|
||||||
pub backup_lsn: Option<Lsn>,
|
"Panicked during broker subscription task, kind: {:?}, error: {e}",
|
||||||
/// LSN of last checkpoint uploaded by pageserver.
|
self.key
|
||||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
)))
|
||||||
#[serde(default)]
|
}
|
||||||
pub remote_consistent_lsn: Option<Lsn>,
|
}
|
||||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
}
|
||||||
#[serde(default)]
|
}
|
||||||
pub peer_horizon_lsn: Option<Lsn>,
|
}
|
||||||
#[serde(default)]
|
|
||||||
pub safekeeper_connstr: Option<String>,
|
impl<V> Drop for BrokerSubscription<V> {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
// we poll data from etcd into the channel in the same struct, so if the whole struct gets dropped,
|
||||||
|
// no more data is used by the receiver and it's safe to cancel and drop the whole etcd subscription task.
|
||||||
|
self.watcher_handle.abort();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An update from the etcd broker.
|
||||||
|
pub struct BrokerUpdate<V> {
|
||||||
|
/// Etcd generation version; the bigger it is, the more recent the data.
|
||||||
|
pub etcd_version: i64,
|
||||||
|
/// Etcd key for the corresponding value, parsed from the broker KV.
|
||||||
|
pub key: SubscriptionFullKey,
|
||||||
|
/// Current etcd value, parsed from the broker KV.
|
||||||
|
pub value: V,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
#[derive(Debug, thiserror::Error)]
|
||||||
@@ -64,331 +83,127 @@ pub enum BrokerError {
|
|||||||
#[error("Etcd client error: {0}. Context: {1}")]
|
#[error("Etcd client error: {0}. Context: {1}")]
|
||||||
EtcdClient(etcd_client::Error, String),
|
EtcdClient(etcd_client::Error, String),
|
||||||
#[error("Error during parsing etcd key: {0}")]
|
#[error("Error during parsing etcd key: {0}")]
|
||||||
InvalidKey(String),
|
KeyNotParsed(String),
|
||||||
#[error("Error during parsing etcd value: {0}")]
|
|
||||||
ParsingError(String),
|
|
||||||
#[error("Internal error: {0}")]
|
#[error("Internal error: {0}")]
|
||||||
InternalError(String),
|
InternalError(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A way to control the data retrieval from a certain subscription.
|
|
||||||
pub struct SkTimelineSubscription {
|
|
||||||
safekeeper_timeline_updates:
|
|
||||||
mpsc::UnboundedReceiver<HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>>>,
|
|
||||||
kind: SkTimelineSubscriptionKind,
|
|
||||||
watcher_handle: JoinHandle<Result<(), BrokerError>>,
|
|
||||||
watcher: Watcher,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SkTimelineSubscription {
|
|
||||||
/// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet.
|
|
||||||
pub async fn fetch_data(
|
|
||||||
&mut self,
|
|
||||||
) -> Option<HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>>> {
|
|
||||||
self.safekeeper_timeline_updates.recv().await
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Cancels the subscription, stopping the data poller and waiting for it to shut down.
|
|
||||||
pub async fn cancel(mut self) -> Result<(), BrokerError> {
|
|
||||||
self.watcher.cancel().await.map_err(|e| {
|
|
||||||
BrokerError::EtcdClient(
|
|
||||||
e,
|
|
||||||
format!(
|
|
||||||
"Failed to cancel timeline subscription, kind: {:?}",
|
|
||||||
self.kind
|
|
||||||
),
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
self.watcher_handle.await.map_err(|e| {
|
|
||||||
BrokerError::InternalError(format!(
|
|
||||||
"Failed to join the timeline updates task, kind: {:?}, error: {e}",
|
|
||||||
self.kind
|
|
||||||
))
|
|
||||||
})?
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The subscription kind to the timeline updates from safekeeper.
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
|
||||||
pub struct SkTimelineSubscriptionKind {
|
|
||||||
broker_etcd_prefix: String,
|
|
||||||
kind: SubscriptionKind,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SkTimelineSubscriptionKind {
|
|
||||||
pub fn all(broker_etcd_prefix: String) -> Self {
|
|
||||||
Self {
|
|
||||||
broker_etcd_prefix,
|
|
||||||
kind: SubscriptionKind::All,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn tenant(broker_etcd_prefix: String, tenant: ZTenantId) -> Self {
|
|
||||||
Self {
|
|
||||||
broker_etcd_prefix,
|
|
||||||
kind: SubscriptionKind::Tenant(tenant),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn timeline(broker_etcd_prefix: String, timeline: ZTenantTimelineId) -> Self {
|
|
||||||
Self {
|
|
||||||
broker_etcd_prefix,
|
|
||||||
kind: SubscriptionKind::Timeline(timeline),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Etcd key to use for watching a certain timeline updates from safekeepers.
|
|
||||||
pub fn watch_key(&self) -> String {
|
|
||||||
match self.kind {
|
|
||||||
SubscriptionKind::All => self.broker_etcd_prefix.to_string(),
|
|
||||||
SubscriptionKind::Tenant(tenant_id) => {
|
|
||||||
format!("{}/{tenant_id}/safekeeper", self.broker_etcd_prefix)
|
|
||||||
}
|
|
||||||
SubscriptionKind::Timeline(ZTenantTimelineId {
|
|
||||||
tenant_id,
|
|
||||||
timeline_id,
|
|
||||||
}) => format!(
|
|
||||||
"{}/{tenant_id}/{timeline_id}/safekeeper",
|
|
||||||
self.broker_etcd_prefix
|
|
||||||
),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
|
||||||
enum SubscriptionKind {
|
|
||||||
/// Get every timeline update.
|
|
||||||
All,
|
|
||||||
/// Get certain tenant timelines' updates.
|
|
||||||
Tenant(ZTenantId),
|
|
||||||
/// Get certain timeline updates.
|
|
||||||
Timeline(ZTenantTimelineId),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Creates a background task to poll etcd for timeline updates from safekeepers.
|
/// Creates a background task to poll etcd for timeline updates from safekeepers.
|
||||||
/// Stops and returns `Err` on any error during etcd communication.
|
/// Stops and returns `Err` on any error during etcd communication.
|
||||||
/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle,
|
/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle,
|
||||||
/// exiting normally in such cases.
|
/// exiting normally in such cases.
|
||||||
pub async fn subscribe_to_safekeeper_timeline_updates(
|
/// Etcd values are parsed as JSON into a type specified in the generic parameter.
|
||||||
|
pub async fn subscribe_for_json_values<V>(
|
||||||
client: &mut Client,
|
client: &mut Client,
|
||||||
subscription: SkTimelineSubscriptionKind,
|
key: SubscriptionKey,
|
||||||
) -> Result<SkTimelineSubscription, BrokerError> {
|
) -> Result<BrokerSubscription<V>, BrokerError>
|
||||||
info!("Subscribing to timeline updates, subscription kind: {subscription:?}");
|
where
|
||||||
let kind = subscription.clone();
|
V: DeserializeOwned + Send + 'static,
|
||||||
|
{
|
||||||
|
subscribe_for_values(client, key, |_, value_str| {
|
||||||
|
match serde_json::from_str::<V>(value_str) {
|
||||||
|
Ok(value) => Some(value),
|
||||||
|
Err(e) => {
|
||||||
|
error!("Failed to parse value str '{value_str}': {e}");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Same as [`subscribe_for_json_values`], but allows specifying a custom parser for the etcd value string.
|
||||||
|
pub async fn subscribe_for_values<P, V>(
|
||||||
|
client: &mut Client,
|
||||||
|
key: SubscriptionKey,
|
||||||
|
value_parser: P,
|
||||||
|
) -> Result<BrokerSubscription<V>, BrokerError>
|
||||||
|
where
|
||||||
|
V: Send + 'static,
|
||||||
|
P: Fn(SubscriptionFullKey, &str) -> Option<V> + Send + 'static,
|
||||||
|
{
|
||||||
|
info!("Subscribing to broker value updates, key: {key:?}");
|
||||||
|
let subscription_key = key.clone();
|
||||||
|
|
||||||
let (watcher, mut stream) = client
|
let (watcher, mut stream) = client
|
||||||
.watch(
|
.watch(key.watch_key(), Some(WatchOptions::new().with_prefix()))
|
||||||
subscription.watch_key(),
|
|
||||||
Some(WatchOptions::new().with_prefix()),
|
|
||||||
)
|
|
||||||
.await
|
.await
|
||||||
.map_err(|e| {
|
.map_err(|e| {
|
||||||
BrokerError::EtcdClient(
|
BrokerError::EtcdClient(
|
||||||
e,
|
e,
|
||||||
format!("Failed to init the watch for subscription {subscription:?}"),
|
format!("Failed to init the watch for subscription {key:?}"),
|
||||||
)
|
)
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let (timeline_updates_sender, safekeeper_timeline_updates) = mpsc::unbounded_channel();
|
let (value_updates_sender, value_updates_receiver) = mpsc::unbounded_channel();
|
||||||
let watcher_handle = tokio::spawn(async move {
|
let watcher_handle = tokio::spawn(async move {
|
||||||
while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!(
|
while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!(
|
||||||
"Failed to get messages from the subscription stream, kind: {:?}, error: {e}", subscription.kind
|
"Failed to get messages from the subscription stream, kind: {:?}, error: {e}", key.kind
|
||||||
)))? {
|
)))? {
|
||||||
if resp.canceled() {
|
if resp.canceled() {
|
||||||
info!("Watch for timeline updates subscription was canceled, exiting");
|
info!("Watch for timeline updates subscription was canceled, exiting");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>> = HashMap::new();
|
|
||||||
// Keep track that the timeline data updates from etcd arrive in the right order.
|
|
||||||
// https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas
|
|
||||||
// > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering.
|
|
||||||
let mut timeline_etcd_versions: HashMap<ZTenantTimelineId, i64> = HashMap::new();
|
|
||||||
|
|
||||||
|
|
||||||
let events = resp.events();
|
let events = resp.events();
|
||||||
debug!("Processing {} events", events.len());
|
debug!("Processing {} events", events.len());
|
||||||
|
|
||||||
for event in events {
|
for event in events {
|
||||||
if EventType::Put == event.event_type() {
|
if EventType::Put == event.event_type() {
|
||||||
if let Some(new_etcd_kv) = event.kv() {
|
if let Some(new_etcd_kv) = event.kv() {
|
||||||
let new_kv_version = new_etcd_kv.version();
|
match parse_etcd_kv(new_etcd_kv, &value_parser, &key.cluster_prefix) {
|
||||||
let (key_str, value_str) = match extract_key_value_str(new_etcd_kv) {
|
Ok(Some((key, value))) => if let Err(e) = value_updates_sender.send(BrokerUpdate {
|
||||||
Ok(strs) => strs,
|
etcd_version: new_etcd_kv.version(),
|
||||||
Err(e) => {
|
key,
|
||||||
error!("Failed to represent etcd KV {new_etcd_kv:?} as pair of str: {e}");
|
value,
|
||||||
continue;
|
}) {
|
||||||
|
info!("Broker value updates for key {key:?} sender got dropped, exiting: {e}");
|
||||||
|
break;
|
||||||
},
|
},
|
||||||
};
|
Ok(None) => debug!("Ignoring key {key:?} : no value was returned by the parser"),
|
||||||
|
Err(BrokerError::KeyNotParsed(e)) => debug!("Unexpected key {key:?} for timeline update: {e}"),
|
||||||
match parse_safekeeper_timeline(&subscription, key_str, value_str) {
|
Err(e) => error!("Failed to represent etcd KV {new_etcd_kv:?}: {e}"),
|
||||||
Ok((zttid, timeline)) => {
|
|
||||||
match timeline_updates
|
|
||||||
.entry(zttid)
|
|
||||||
.or_default()
|
|
||||||
.entry(timeline.safekeeper_id)
|
|
||||||
{
|
|
||||||
hash_map::Entry::Occupied(mut o) => {
|
|
||||||
let old_etcd_kv_version = timeline_etcd_versions.get(&zttid).copied().unwrap_or(i64::MIN);
|
|
||||||
if old_etcd_kv_version < new_kv_version {
|
|
||||||
o.insert(timeline.info);
|
|
||||||
timeline_etcd_versions.insert(zttid,new_kv_version);
|
|
||||||
} else {
|
|
||||||
debug!("Skipping etcd timeline update due to older version compared to one that's already stored");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
hash_map::Entry::Vacant(v) => {
|
|
||||||
v.insert(timeline.info);
|
|
||||||
timeline_etcd_versions.insert(zttid,new_kv_version);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// it is normal to get other keys when we subscribe to everything
|
|
||||||
Err(BrokerError::InvalidKey(e)) => debug!("Unexpected key for timeline update: {e}"),
|
|
||||||
Err(e) => error!("Failed to parse timeline update: {e}"),
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Err(e) = timeline_updates_sender.send(timeline_updates) {
|
|
||||||
info!("Timeline updates sender got dropped, exiting: {e}");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}.instrument(info_span!("etcd_broker")));
|
}.instrument(info_span!("etcd_broker")));
|
||||||
|
|
||||||
Ok(SkTimelineSubscription {
|
Ok(BrokerSubscription {
|
||||||
kind,
|
key: subscription_key,
|
||||||
safekeeper_timeline_updates,
|
value_updates: value_updates_receiver,
|
||||||
watcher_handle,
|
watcher_handle,
|
||||||
watcher,
|
watcher,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_key_value_str(kv: &KeyValue) -> Result<(&str, &str), BrokerError> {
|
fn parse_etcd_kv<P, V>(
|
||||||
let key = kv.key_str().map_err(|e| {
|
kv: &KeyValue,
|
||||||
|
value_parser: &P,
|
||||||
|
cluster_prefix: &str,
|
||||||
|
) -> Result<Option<(SubscriptionFullKey, V)>, BrokerError>
|
||||||
|
where
|
||||||
|
P: Fn(SubscriptionFullKey, &str) -> Option<V>,
|
||||||
|
{
|
||||||
|
let key_str = kv.key_str().map_err(|e| {
|
||||||
BrokerError::EtcdClient(e, "Failed to extract key str out of etcd KV".to_string())
|
BrokerError::EtcdClient(e, "Failed to extract key str out of etcd KV".to_string())
|
||||||
})?;
|
})?;
|
||||||
let value = kv.value_str().map_err(|e| {
|
let value_str = kv.value_str().map_err(|e| {
|
||||||
BrokerError::EtcdClient(e, "Failed to extract value str out of etcd KV".to_string())
|
BrokerError::EtcdClient(e, "Failed to extract value str out of etcd KV".to_string())
|
||||||
})?;
|
})?;
|
||||||
Ok((key, value))
|
|
||||||
}
|
|
||||||
|
|
||||||
static SK_TIMELINE_KEY_REGEX: Lazy<Regex> = Lazy::new(|| {
|
if !key_str.starts_with(cluster_prefix) {
|
||||||
Regex::new("/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]]+)$")
|
return Err(BrokerError::KeyNotParsed(format!(
|
||||||
.expect("wrong regex for safekeeper timeline etcd key")
|
"KV has unexpected key '{key_str}' that does not start with cluster prefix {cluster_prefix}"
|
||||||
});
|
|
||||||
|
|
||||||
fn parse_safekeeper_timeline(
|
|
||||||
subscription: &SkTimelineSubscriptionKind,
|
|
||||||
key_str: &str,
|
|
||||||
value_str: &str,
|
|
||||||
) -> Result<(ZTenantTimelineId, SafekeeperTimeline), BrokerError> {
|
|
||||||
let broker_prefix = subscription.broker_etcd_prefix.as_str();
|
|
||||||
if !key_str.starts_with(broker_prefix) {
|
|
||||||
return Err(BrokerError::InvalidKey(format!(
|
|
||||||
"KV has unexpected key '{key_str}' that does not start with broker prefix {broker_prefix}"
|
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
let key_part = &key_str[broker_prefix.len()..];
|
let key = SubscriptionFullKey::from_str(&key_str[cluster_prefix.len()..]).map_err(|e| {
|
||||||
let key_captures = match SK_TIMELINE_KEY_REGEX.captures(key_part) {
|
BrokerError::KeyNotParsed(format!("Failed to parse KV key '{key_str}': {e}"))
|
||||||
Some(captures) => captures,
|
|
||||||
None => {
|
|
||||||
return Err(BrokerError::InvalidKey(format!(
|
|
||||||
"KV has unexpected key part '{key_part}' that does not match required regex {}",
|
|
||||||
SK_TIMELINE_KEY_REGEX.as_str()
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let info = serde_json::from_str(value_str).map_err(|e| {
|
|
||||||
BrokerError::ParsingError(format!(
|
|
||||||
"Failed to parse '{value_str}' as safekeeper timeline info: {e}"
|
|
||||||
))
|
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let zttid = ZTenantTimelineId::new(
|
Ok(value_parser(key, value_str).map(|value| (key, value)))
|
||||||
parse_capture(&key_captures, 1).map_err(BrokerError::ParsingError)?,
|
|
||||||
parse_capture(&key_captures, 2).map_err(BrokerError::ParsingError)?,
|
|
||||||
);
|
|
||||||
let safekeeper_id = NodeId(parse_capture(&key_captures, 3).map_err(BrokerError::ParsingError)?);
|
|
||||||
|
|
||||||
Ok((
|
|
||||||
zttid,
|
|
||||||
SafekeeperTimeline {
|
|
||||||
safekeeper_id,
|
|
||||||
info,
|
|
||||||
},
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_capture<T>(caps: &Captures, index: usize) -> Result<T, String>
|
|
||||||
where
|
|
||||||
T: FromStr,
|
|
||||||
<T as FromStr>::Err: Display,
|
|
||||||
{
|
|
||||||
let capture_match = caps
|
|
||||||
.get(index)
|
|
||||||
.ok_or_else(|| format!("Failed to get capture match at index {index}"))?
|
|
||||||
.as_str();
|
|
||||||
capture_match.parse().map_err(|e| {
|
|
||||||
format!(
|
|
||||||
"Failed to parse {} from {capture_match}: {e}",
|
|
||||||
std::any::type_name::<T>()
|
|
||||||
)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use utils::zid::ZTimelineId;
|
|
||||||
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn typical_etcd_prefix_should_be_parsed() {
|
|
||||||
let prefix = "neon";
|
|
||||||
let tenant_id = ZTenantId::generate();
|
|
||||||
let timeline_id = ZTimelineId::generate();
|
|
||||||
let all_subscription = SkTimelineSubscriptionKind {
|
|
||||||
broker_etcd_prefix: prefix.to_string(),
|
|
||||||
kind: SubscriptionKind::All,
|
|
||||||
};
|
|
||||||
let tenant_subscription = SkTimelineSubscriptionKind {
|
|
||||||
broker_etcd_prefix: prefix.to_string(),
|
|
||||||
kind: SubscriptionKind::Tenant(tenant_id),
|
|
||||||
};
|
|
||||||
let timeline_subscription = SkTimelineSubscriptionKind {
|
|
||||||
broker_etcd_prefix: prefix.to_string(),
|
|
||||||
kind: SubscriptionKind::Timeline(ZTenantTimelineId::new(tenant_id, timeline_id)),
|
|
||||||
};
|
|
||||||
|
|
||||||
let typical_etcd_kv_strs = [
|
|
||||||
(
|
|
||||||
format!("{prefix}/{tenant_id}/{timeline_id}/safekeeper/1"),
|
|
||||||
r#"{"last_log_term":231,"flush_lsn":"0/241BB70","commit_lsn":"0/241BB70","backup_lsn":"0/2000000","remote_consistent_lsn":"0/0","peer_horizon_lsn":"0/16960E8","safekeeper_connstr":"something.local:1234","pageserver_connstr":"postgresql://(null):@somethine.else.local:3456"}"#,
|
|
||||||
),
|
|
||||||
(
|
|
||||||
format!("{prefix}/{tenant_id}/{timeline_id}/safekeeper/13"),
|
|
||||||
r#"{"last_log_term":231,"flush_lsn":"0/241BB70","commit_lsn":"0/241BB70","backup_lsn":"0/2000000","remote_consistent_lsn":"0/0","peer_horizon_lsn":"0/16960E8","safekeeper_connstr":"something.local:1234","pageserver_connstr":"postgresql://(null):@somethine.else.local:3456"}"#,
|
|
||||||
),
|
|
||||||
];
|
|
||||||
|
|
||||||
for (key_string, value_str) in typical_etcd_kv_strs {
|
|
||||||
for subscription in [
|
|
||||||
&all_subscription,
|
|
||||||
&tenant_subscription,
|
|
||||||
&timeline_subscription,
|
|
||||||
] {
|
|
||||||
let (id, _timeline) =
|
|
||||||
parse_safekeeper_timeline(subscription, &key_string, value_str)
|
|
||||||
.unwrap_or_else(|e| panic!("Should be able to parse etcd key string '{key_string}' and etcd value string '{value_str}' for subscription {subscription:?}, but got: {e}"));
|
|
||||||
assert_eq!(id, ZTenantTimelineId::new(tenant_id, timeline_id));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
310
libs/etcd_broker/src/subscription_key.rs
Normal file
@@ -0,0 +1,310 @@
|
|||||||
|
//! Etcd broker keys, used in the project and shared between instances.
|
||||||
|
//! The keys are split into two categories:
|
||||||
|
//!
|
||||||
|
//! * [`SubscriptionFullKey`] full key format: `<cluster_prefix>/<tenant>/<timeline>/<node_kind>/<operation>/<node_id>`
|
||||||
|
//! Always returned from etcd in this form; it always starts with the user-provided key.
|
||||||
|
//!
|
||||||
|
//! * [`SubscriptionKey`] user input key format: always partial, since it's unknown which `node_id`s are available.
|
||||||
|
//! Full key always starts with the user input one, due to etcd subscription properties.
|
||||||
|
|
||||||
|
use std::{fmt::Display, str::FromStr};
|
||||||
|
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
use regex::{Captures, Regex};
|
||||||
|
use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId};
|
||||||
|
|
||||||
|
/// The subscription kind to the timeline updates from safekeeper.
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
|
pub struct SubscriptionKey {
|
||||||
|
/// Generic cluster prefix, allowing multiple logical groups to share the same etcd instance.
|
||||||
|
pub cluster_prefix: String,
|
||||||
|
/// The subscription kind.
|
||||||
|
pub kind: SubscriptionKind,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// All currently possible key kinds of an etcd broker subscription.
|
||||||
|
/// Etcd works such that every key that starts with the given subscription key is considered matching and
|
||||||
|
/// returned as part of the subscription.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
|
pub enum SubscriptionKind {
|
||||||
|
/// Get every update in etcd.
|
||||||
|
All,
|
||||||
|
/// Get etcd updates for any timeline of a certain tenant, affected by any operation from any node kind.
|
||||||
|
TenantTimelines(ZTenantId),
|
||||||
|
/// Get etcd updates for a certain timeline of a tenant, affected by any operation from any node kind.
|
||||||
|
Timeline(ZTenantTimelineId),
|
||||||
|
/// Get etcd timeline updates, specific to a certain node kind.
|
||||||
|
Node(ZTenantTimelineId, NodeKind),
|
||||||
|
/// Get etcd timeline updates for a certain operation on specific nodes.
|
||||||
|
Operation(ZTenantTimelineId, NodeKind, OperationKind),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// All kinds of nodes, able to write into etcd.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
|
pub enum NodeKind {
|
||||||
|
Safekeeper,
|
||||||
|
Pageserver,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
|
pub enum OperationKind {
|
||||||
|
Safekeeper(SkOperationKind),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Current operations, running inside the safekeeper node.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
|
pub enum SkOperationKind {
|
||||||
|
TimelineInfo,
|
||||||
|
WalBackup,
|
||||||
|
}
|
||||||
|
|
||||||
|
static SUBSCRIPTION_FULL_KEY_REGEX: Lazy<Regex> = Lazy::new(|| {
|
||||||
|
Regex::new("/([[:xdigit:]]+)/([[:xdigit:]]+)/([^/]+)/([^/]+)/([[:digit:]]+)$")
|
||||||
|
.expect("wrong subscription full etcd key regex")
|
||||||
|
});
|
||||||
|
|
||||||
|
/// Full key, received from etcd during any of the component's work.
|
||||||
|
/// No other etcd keys are considered during system's work.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
|
pub struct SubscriptionFullKey {
|
||||||
|
pub id: ZTenantTimelineId,
|
||||||
|
pub node_kind: NodeKind,
|
||||||
|
pub operation: OperationKind,
|
||||||
|
pub node_id: NodeId,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SubscriptionKey {
|
||||||
|
/// Subscribes for all etcd updates.
|
||||||
|
pub fn all(cluster_prefix: String) -> Self {
|
||||||
|
SubscriptionKey {
|
||||||
|
cluster_prefix,
|
||||||
|
kind: SubscriptionKind::All,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Subscribes to a given timeline info updates from safekeepers.
|
||||||
|
pub fn sk_timeline_info(cluster_prefix: String, timeline: ZTenantTimelineId) -> Self {
|
||||||
|
Self {
|
||||||
|
cluster_prefix,
|
||||||
|
kind: SubscriptionKind::Operation(
|
||||||
|
timeline,
|
||||||
|
NodeKind::Safekeeper,
|
||||||
|
OperationKind::Safekeeper(SkOperationKind::TimelineInfo),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Subscribes to all timeline updates during specific operations running on the corresponding nodes.
|
||||||
|
pub fn operation(
|
||||||
|
cluster_prefix: String,
|
||||||
|
timeline: ZTenantTimelineId,
|
||||||
|
node_kind: NodeKind,
|
||||||
|
operation: OperationKind,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
cluster_prefix,
|
||||||
|
kind: SubscriptionKind::Operation(timeline, node_kind, operation),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Etcd key to use for watching a certain timeline updates from safekeepers.
|
||||||
|
pub fn watch_key(&self) -> String {
|
||||||
|
let cluster_prefix = &self.cluster_prefix;
|
||||||
|
match self.kind {
|
||||||
|
SubscriptionKind::All => cluster_prefix.to_string(),
|
||||||
|
SubscriptionKind::TenantTimelines(tenant_id) => {
|
||||||
|
format!("{cluster_prefix}/{tenant_id}")
|
||||||
|
}
|
||||||
|
SubscriptionKind::Timeline(id) => {
|
||||||
|
format!("{cluster_prefix}/{id}")
|
||||||
|
}
|
||||||
|
SubscriptionKind::Node(id, node_kind) => {
|
||||||
|
format!("{cluster_prefix}/{id}/{node_kind}")
|
||||||
|
}
|
||||||
|
SubscriptionKind::Operation(id, node_kind, operation_kind) => {
|
||||||
|
format!("{cluster_prefix}/{id}/{node_kind}/{operation_kind}")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Display for OperationKind {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
OperationKind::Safekeeper(o) => o.fmt(f),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for OperationKind {
|
||||||
|
type Err = String;
|
||||||
|
|
||||||
|
fn from_str(operation_kind_str: &str) -> Result<Self, Self::Err> {
|
||||||
|
match operation_kind_str {
|
||||||
|
"timeline_info" => Ok(OperationKind::Safekeeper(SkOperationKind::TimelineInfo)),
|
||||||
|
"wal_backup" => Ok(OperationKind::Safekeeper(SkOperationKind::WalBackup)),
|
||||||
|
_ => Err(format!("Unknown operation kind: {operation_kind_str}")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Display for SubscriptionFullKey {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
let Self {
|
||||||
|
id,
|
||||||
|
node_kind,
|
||||||
|
operation,
|
||||||
|
node_id,
|
||||||
|
} = self;
|
||||||
|
write!(f, "{id}/{node_kind}/{operation}/{node_id}")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for SubscriptionFullKey {
|
||||||
|
type Err = String;
|
||||||
|
|
||||||
|
fn from_str(subscription_kind_str: &str) -> Result<Self, Self::Err> {
|
||||||
|
let key_captures = match SUBSCRIPTION_FULL_KEY_REGEX.captures(subscription_kind_str) {
|
||||||
|
Some(captures) => captures,
|
||||||
|
None => {
|
||||||
|
return Err(format!(
|
||||||
|
"Subscription kind str does not match a subscription full key regex {}",
|
||||||
|
SUBSCRIPTION_FULL_KEY_REGEX.as_str()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
id: ZTenantTimelineId::new(
|
||||||
|
parse_capture(&key_captures, 1)?,
|
||||||
|
parse_capture(&key_captures, 2)?,
|
||||||
|
),
|
||||||
|
node_kind: parse_capture(&key_captures, 3)?,
|
||||||
|
operation: parse_capture(&key_captures, 4)?,
|
||||||
|
node_id: NodeId(parse_capture(&key_captures, 5)?),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_capture<T>(caps: &Captures, index: usize) -> Result<T, String>
|
||||||
|
where
|
||||||
|
T: FromStr,
|
||||||
|
<T as FromStr>::Err: Display,
|
||||||
|
{
|
||||||
|
let capture_match = caps
|
||||||
|
.get(index)
|
||||||
|
.ok_or_else(|| format!("Failed to get capture match at index {index}"))?
|
||||||
|
.as_str();
|
||||||
|
capture_match.parse().map_err(|e| {
|
||||||
|
format!(
|
||||||
|
"Failed to parse {} from {capture_match}: {e}",
|
||||||
|
std::any::type_name::<T>()
|
||||||
|
)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Display for NodeKind {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
Self::Safekeeper => write!(f, "safekeeper"),
|
||||||
|
Self::Pageserver => write!(f, "pageserver"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for NodeKind {
|
||||||
|
type Err = String;
|
||||||
|
|
||||||
|
fn from_str(node_kind_str: &str) -> Result<Self, Self::Err> {
|
||||||
|
match node_kind_str {
|
||||||
|
"safekeeper" => Ok(Self::Safekeeper),
|
||||||
|
"pageserver" => Ok(Self::Pageserver),
|
||||||
|
_ => Err(format!("Invalid node kind: {node_kind_str}")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Display for SkOperationKind {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
Self::TimelineInfo => write!(f, "timeline_info"),
|
||||||
|
Self::WalBackup => write!(f, "wal_backup"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for SkOperationKind {
|
||||||
|
type Err = String;
|
||||||
|
|
||||||
|
fn from_str(operation_str: &str) -> Result<Self, Self::Err> {
|
||||||
|
match operation_str {
|
||||||
|
"timeline_info" => Ok(Self::TimelineInfo),
|
||||||
|
"wal_backup" => Ok(Self::WalBackup),
|
||||||
|
_ => Err(format!("Invalid operation: {operation_str}")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use utils::zid::ZTimelineId;
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn full_cluster_key_parsing() {
|
||||||
|
let prefix = "neon";
|
||||||
|
let node_kind = NodeKind::Safekeeper;
|
||||||
|
let operation_kind = OperationKind::Safekeeper(SkOperationKind::WalBackup);
|
||||||
|
let tenant_id = ZTenantId::generate();
|
||||||
|
let timeline_id = ZTimelineId::generate();
|
||||||
|
let id = ZTenantTimelineId::new(tenant_id, timeline_id);
|
||||||
|
let node_id = NodeId(1);
|
||||||
|
|
||||||
|
let timeline_subscription_keys = [
|
||||||
|
SubscriptionKey {
|
||||||
|
cluster_prefix: prefix.to_string(),
|
||||||
|
kind: SubscriptionKind::All,
|
||||||
|
},
|
||||||
|
SubscriptionKey {
|
||||||
|
cluster_prefix: prefix.to_string(),
|
||||||
|
kind: SubscriptionKind::TenantTimelines(tenant_id),
|
||||||
|
},
|
||||||
|
SubscriptionKey {
|
||||||
|
cluster_prefix: prefix.to_string(),
|
||||||
|
kind: SubscriptionKind::Timeline(id),
|
||||||
|
},
|
||||||
|
SubscriptionKey {
|
||||||
|
cluster_prefix: prefix.to_string(),
|
||||||
|
kind: SubscriptionKind::Node(id, node_kind),
|
||||||
|
},
|
||||||
|
SubscriptionKey {
|
||||||
|
cluster_prefix: prefix.to_string(),
|
||||||
|
kind: SubscriptionKind::Operation(id, node_kind, operation_kind),
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
let full_key_string = format!(
|
||||||
|
"{}/{node_id}",
|
||||||
|
timeline_subscription_keys.last().unwrap().watch_key()
|
||||||
|
);
|
||||||
|
|
||||||
|
for key in timeline_subscription_keys {
|
||||||
|
assert!(full_key_string.starts_with(&key.watch_key()), "Full key '{full_key_string}' should start with each of the keys, but {key:?} did not match");
|
||||||
|
}
|
||||||
|
|
||||||
|
let full_key = SubscriptionFullKey::from_str(&full_key_string).unwrap_or_else(|e| {
|
||||||
|
panic!("Failed to parse {full_key_string} as a subscription full key: {e}")
|
||||||
|
});
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
full_key,
|
||||||
|
SubscriptionFullKey {
|
||||||
|
id,
|
||||||
|
node_kind,
|
||||||
|
operation: operation_kind,
|
||||||
|
node_id
|
||||||
|
}
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
35
libs/etcd_broker/src/subscription_value.rs
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
//! Module for the values to put into etcd.
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use serde_with::{serde_as, DisplayFromStr};
|
||||||
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
|
/// Data about safekeeper's timeline. Fields made optional for easy migrations.
|
||||||
|
#[serde_as]
|
||||||
|
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||||
|
pub struct SkTimelineInfo {
|
||||||
|
/// Term of the last entry.
|
||||||
|
pub last_log_term: Option<u64>,
|
||||||
|
/// LSN of the last record.
|
||||||
|
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub flush_lsn: Option<Lsn>,
|
||||||
|
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||||
|
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub commit_lsn: Option<Lsn>,
|
||||||
|
/// LSN up to which safekeeper has backed WAL.
|
||||||
|
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub backup_lsn: Option<Lsn>,
|
||||||
|
/// LSN of last checkpoint uploaded by pageserver.
|
||||||
|
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub remote_consistent_lsn: Option<Lsn>,
|
||||||
|
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub peer_horizon_lsn: Option<Lsn>,
|
||||||
|
/// A connection string to use for WAL receiving.
|
||||||
|
#[serde(default)]
|
||||||
|
pub safekeeper_connstr: Option<String>,
|
||||||
|
}
|
||||||
@@ -23,7 +23,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
|||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
env_logger = "0.9"
|
env_logger = "0.9"
|
||||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||||
wal_generate = { path = "wal_generate" }
|
wal_craft = { path = "wal_craft" }
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
bindgen = "0.59.1"
|
bindgen = "0.59.1"
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ extern crate bindgen;

 use std::env;
 use std::path::PathBuf;
+use std::process::Command;

 use bindgen::callbacks::ParseCallbacks;

@@ -45,6 +46,43 @@ fn main() {
     // Tell cargo to invalidate the built crate whenever the wrapper changes
     println!("cargo:rerun-if-changed=pg_control_ffi.h");
+
+    // Finding the location of C headers for the Postgres server:
+    // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/tmp_install`
+    // - if there's a `bin/pg_config` file use it for getting the server include dir, otherwise use `<project_root>/tmp_install/include/postgresql/server`
+    let mut pg_install_dir: PathBuf;
+    if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
+        pg_install_dir = postgres_install_dir.into();
+    } else {
+        pg_install_dir = PathBuf::from("tmp_install")
+    }
+
+    if pg_install_dir.is_relative() {
+        let cwd = env::current_dir().unwrap();
+        pg_install_dir = cwd.join("..").join("..").join(pg_install_dir);
+    }
+
+    let pg_config_bin = pg_install_dir.join("bin").join("pg_config");
+    let inc_server_path: String = if pg_config_bin.exists() {
+        let output = Command::new(pg_config_bin)
+            .arg("--includedir-server")
+            .output()
+            .expect("failed to execute `pg_config --includedir-server`");
+
+        if !output.status.success() {
+            panic!("`pg_config --includedir-server` failed")
+        }
+
+        String::from_utf8(output.stdout).unwrap().trim_end().into()
+    } else {
+        pg_install_dir
+            .join("include")
+            .join("postgresql")
+            .join("server")
+            .into_os_string()
+            .into_string()
+            .unwrap()
+    };
+
     // The bindgen::Builder is the main entry point
     // to bindgen, and lets you build up options for
     // the resulting bindings.

@@ -81,15 +119,7 @@ fn main() {
         // explicit padding fields.
         .explicit_padding(true)
         //
-        // Path the server include dir. It is in tmp_install/include/server, if you did
-        // "configure --prefix=<path to tmp_install>". But if you used "configure --prefix=/",
-        // and used DESTDIR to move it into tmp_install, then it's in
-        // tmp_install/include/postgres/server
-        // 'pg_config --includedir-server' would perhaps be the more proper way to find it,
-        // but this will do for now.
-        //
-        .clang_arg("-I../../tmp_install/include/server")
-        .clang_arg("-I../../tmp_install/include/postgresql/server")
+        .clang_arg(format!("-I{inc_server_path}"))
         //
         // Finish the builder and generate the bindings.
         //
@@ -82,7 +82,17 @@ impl WalStreamDecoder {
         // that cross page boundaries.
         loop {
             // parse and verify page boundaries as we go
-            if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
+            if self.padlen > 0 {
+                // We should first skip padding, as we may have to skip some page headers if we're processing the XLOG_SWITCH record.
+                if self.inputbuf.remaining() < self.padlen as usize {
+                    return Ok(None);
+                }
+
+                // skip padding
+                self.inputbuf.advance(self.padlen as usize);
+                self.lsn += self.padlen as u64;
+                self.padlen = 0;
+            } else if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
                 // parse long header

                 if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {

@@ -128,15 +138,6 @@ impl WalStreamDecoder {

                 self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
                 continue;
-            } else if self.padlen > 0 {
-                if self.inputbuf.remaining() < self.padlen as usize {
-                    return Ok(None);
-                }
-
-                // skip padding
-                self.inputbuf.advance(self.padlen as usize);
-                self.lsn += self.padlen as u64;
-                self.padlen = 0;
             } else if self.contlen == 0 {
                 assert!(self.recordbuf.is_empty());

@@ -226,10 +227,10 @@ impl WalStreamDecoder {
             self.padlen = self.lsn.calc_padding(8u32) as u32;
         }

-        // Always align resulting LSN on 0x8 boundary -- that is important for getPage()
-        // and WalReceiver integration. Since this code is used both for WalReceiver and
-        // initial WAL import let's force alignment right here.
-        let result = (self.lsn.align(), recordbuf);
+        // We should return LSN of the next record, not the last byte of this record or
+        // the byte immediately after. Note that this handles both XLOG_SWITCH and usual
+        // records, the former "spans" until the next WAL segment (see test_xlog_switch).
+        let result = (self.lsn + self.padlen as u64, recordbuf);
         Ok(Some(result))
     }
 }
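For orientation, a standalone sketch of the 8-byte alignment arithmetic the decoder relies on above. The free function below only illustrates what `Lsn::calc_padding(8u32)` computes; it is not the project's API:

/// Illustration only: padding needed to bring `lsn` up to an `align`-byte boundary.
fn calc_padding(lsn: u64, align: u64) -> u64 {
    (align - (lsn % align)) % align
}

fn main() {
    // A record ending at ...0D needs 3 bytes of padding; one ending on the
    // boundary needs none.
    assert_eq!(calc_padding(0x2000_000D, 8), 3);
    assert_eq!(calc_padding(0x2000_0010, 8), 0);
    // The decoder now returns `self.lsn + self.padlen`, i.e. where the next
    // record starts, instead of calling `align()` on the end of this record.
    let lsn = 0x2000_000D_u64;
    assert_eq!(lsn + calc_padding(lsn, 8), 0x2000_0010);
}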
@@ -597,19 +597,18 @@ mod tests {
     fn init_logging() {
         let _ = env_logger::Builder::from_env(
             env_logger::Env::default()
-                .default_filter_or("wal_generate=info,postgres_ffi::xlog_utils=trace"),
+                .default_filter_or("wal_craft=info,postgres_ffi::xlog_utils=trace"),
         )
         .is_test(true)
         .try_init();
     }

-    fn test_end_of_wal(
+    fn test_end_of_wal<C: wal_craft::Crafter>(
         test_name: &str,
-        generate_wal: impl Fn(&mut postgres::Client) -> anyhow::Result<postgres::types::PgLsn>,
         expected_end_of_wal_non_partial: Lsn,
         last_segment: &str,
     ) {
-        use wal_generate::*;
+        use wal_craft::*;
         // 1. Generate some WAL
         let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
             .join("..")

@@ -622,9 +621,9 @@ mod tests {
             fs::remove_dir_all(&cfg.datadir).unwrap();
         }
         cfg.initdb().unwrap();
-        let mut srv = cfg.start_server().unwrap();
+        let srv = cfg.start_server().unwrap();
         let expected_wal_end: Lsn =
-            u64::from(generate_wal(&mut srv.connect_with_timeout().unwrap()).unwrap()).into();
+            u64::from(C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap()).into();
         srv.kill();

         // 2. Pick WAL generated by initdb

@@ -681,9 +680,8 @@ mod tests {
     #[test]
     pub fn test_find_end_of_wal_simple() {
         init_logging();
-        test_end_of_wal(
+        test_end_of_wal::<wal_craft::Simple>(
            "test_find_end_of_wal_simple",
-            wal_generate::generate_simple,
            "0/2000000".parse::<Lsn>().unwrap(),
            "000000010000000000000001",
        );

@@ -692,9 +690,8 @@ mod tests {
     #[test]
     pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
         init_logging();
-        test_end_of_wal(
+        test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
            "test_find_end_of_wal_crossing_segment_followed_by_small_one",
-            wal_generate::generate_wal_record_crossing_segment_followed_by_small_one,
            "0/3000000".parse::<Lsn>().unwrap(),
            "000000010000000000000002",
        );

@@ -704,9 +701,8 @@ mod tests {
     #[ignore = "not yet fixed, needs correct parsing of pre-last segments"] // TODO
     pub fn test_find_end_of_wal_last_crossing_segment() {
         init_logging();
-        test_end_of_wal(
+        test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
            "test_find_end_of_wal_last_crossing_segment",
-            wal_generate::generate_last_wal_record_crossing_segment,
            "0/3000000".parse::<Lsn>().unwrap(),
            "000000010000000000000002",
        );
@@ -1,5 +1,5 @@
 [package]
-name = "wal_generate"
+name = "wal_craft"
 version = "0.1.0"
 edition = "2021"

@@ -10,5 +10,7 @@ anyhow = "1.0"
 clap = "3.0"
 env_logger = "0.9"
 log = "0.4"
+once_cell = "1.8.0"
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres_ffi = { path = "../" }
 tempfile = "3.2"
libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs (new file, 100 lines)
@@ -0,0 +1,100 @@
+use anyhow::*;
+use clap::{App, Arg, ArgMatches};
+use std::str::FromStr;
+use wal_craft::*;
+
+fn main() -> Result<()> {
+    env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("wal_craft=info"))
+        .init();
+    let type_arg = &Arg::new("type")
+        .takes_value(true)
+        .help("Type of WAL to craft")
+        .possible_values([
+            Simple::NAME,
+            LastWalRecordXlogSwitch::NAME,
+            LastWalRecordXlogSwitchEndsOnPageBoundary::NAME,
+            WalRecordCrossingSegmentFollowedBySmallOne::NAME,
+            LastWalRecordCrossingSegment::NAME,
+        ])
+        .required(true);
+    let arg_matches = App::new("Postgres WAL crafter")
+        .about("Crafts Postgres databases with specific WAL properties")
+        .subcommand(
+            App::new("print-postgres-config")
+                .about("Print the configuration required for PostgreSQL server before running this script")
+        )
+        .subcommand(
+            App::new("with-initdb")
+                .about("Craft WAL in a new data directory first initialized with initdb")
+                .arg(type_arg)
+                .arg(
+                    Arg::new("datadir")
+                        .takes_value(true)
+                        .help("Data directory for the Postgres server")
+                        .required(true)
+                )
+                .arg(
+                    Arg::new("pg-distrib-dir")
+                        .long("pg-distrib-dir")
+                        .takes_value(true)
+                        .help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)")
+                        .default_value("/usr/local")
+                )
+        )
+        .subcommand(
+            App::new("in-existing")
+                .about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.")
+                .arg(type_arg)
+                .arg(
+                    Arg::new("connection")
+                        .takes_value(true)
+                        .help("Connection string to the Postgres database to populate")
+                        .required(true)
+                )
+        )
+        .get_matches();
+
+    let wal_craft = |arg_matches: &ArgMatches, client| {
+        let lsn = match arg_matches.value_of("type").unwrap() {
+            Simple::NAME => Simple::craft(client)?,
+            LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?,
+            LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => {
+                LastWalRecordXlogSwitchEndsOnPageBoundary::craft(client)?
+            }
+            WalRecordCrossingSegmentFollowedBySmallOne::NAME => {
+                WalRecordCrossingSegmentFollowedBySmallOne::craft(client)?
+            }
+            LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
+            a => panic!("Unknown --type argument: {}", a),
+        };
+        println!("end_of_wal = {}", lsn);
+        Ok(())
+    };
+
+    match arg_matches.subcommand() {
+        None => panic!("No subcommand provided"),
+        Some(("print-postgres-config", _)) => {
+            for cfg in REQUIRED_POSTGRES_CONFIG.iter() {
+                println!("{}", cfg);
+            }
+            Ok(())
+        }
+        Some(("with-initdb", arg_matches)) => {
+            let cfg = Conf {
+                pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
+                datadir: arg_matches.value_of("datadir").unwrap().into(),
+            };
+            cfg.initdb()?;
+            let srv = cfg.start_server()?;
+            wal_craft(arg_matches, &mut srv.connect_with_timeout()?)?;
+            srv.kill();
+            Ok(())
+        }
+        Some(("in-existing", arg_matches)) => wal_craft(
+            arg_matches,
+            &mut postgres::Config::from_str(arg_matches.value_of("connection").unwrap())?
+                .connect(postgres::NoTls)?,
+        ),
+        Some(_) => panic!("Unknown subcommand"),
+    }
+}
@@ -1,9 +1,14 @@
 use anyhow::*;
 use core::time::Duration;
 use log::*;
+use once_cell::sync::Lazy;
 use postgres::types::PgLsn;
 use postgres::Client;
+use postgres_ffi::xlog_utils::{
+    XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+};
 use std::cmp::Ordering;
+use std::fs;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
 use std::time::Instant;
@@ -21,6 +26,16 @@ pub struct PostgresServer {
     client_config: postgres::Config,
 }

+pub static REQUIRED_POSTGRES_CONFIG: Lazy<Vec<&'static str>> = Lazy::new(|| {
+    vec![
+        "wal_keep_size=50MB", // Ensure old WAL is not removed
+        "shared_preload_libraries=neon", // can only be loaded at startup
+        // Disable background processes as much as possible
+        "wal_writer_delay=10s",
+        "autovacuum=off",
+    ]
+});
+
 impl Conf {
     fn pg_bin_dir(&self) -> PathBuf {
         self.pg_distrib_dir.join("bin")
@@ -69,6 +84,12 @@ impl Conf {

     pub fn start_server(&self) -> Result<PostgresServer> {
         info!("Starting Postgres server in {:?}", self.datadir);
+        let log_file = fs::File::create(self.datadir.join("pg.log")).with_context(|| {
+            format!(
+                "Failed to create pg.log file in directory {}",
+                self.datadir.display()
+            )
+        })?;
         let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols)
         let unix_socket_dir_path = unix_socket_dir.path().to_owned();
         let server_process = self
@@ -78,13 +99,9 @@ impl Conf {
             .arg(unix_socket_dir_path.as_os_str())
             .arg("-D")
             .arg(self.datadir.as_os_str())
-            .args(&["-c", "wal_keep_size=50MB"]) // Ensure old WAL is not removed
             .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output
-            .args(&["-c", "shared_preload_libraries=neon"]) // can only be loaded at startup
-            // Disable background processes as much as possible
-            .args(&["-c", "wal_writer_delay=10s"])
-            .args(&["-c", "autovacuum=off"])
-            .stderr(Stdio::null())
+            .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg]))
+            .stderr(Stdio::from(log_file))
             .spawn()?;
         let server = PostgresServer {
             process: server_process,
@@ -137,7 +154,7 @@ impl PostgresServer {
         bail!("Connection timed out");
     }

-    pub fn kill(&mut self) {
+    pub fn kill(mut self) {
         self.process.kill().unwrap();
         self.process.wait().unwrap();
     }
@@ -174,12 +191,16 @@ pub trait PostgresClientExt: postgres::GenericClient {

 impl<C: postgres::GenericClient> PostgresClientExt for C {}

-fn generate_internal<C: postgres::GenericClient>(
-    client: &mut C,
-    f: impl Fn(&mut C, PgLsn) -> Result<Option<PgLsn>>,
-) -> Result<PgLsn> {
+pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result<()> {
     client.execute("create extension if not exists neon_test_utils", &[])?;
+
+    let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0);
+    ensure!(wal_keep_size == "50MB");
+    let wal_writer_delay: String = client.query_one("SHOW wal_writer_delay", &[])?.get(0);
+    ensure!(wal_writer_delay == "10s");
+    let autovacuum: String = client.query_one("SHOW autovacuum", &[])?.get(0);
+    ensure!(autovacuum == "off");
+
     let wal_segment_size = client.query_one(
         "select cast(setting as bigint) as setting, unit \
         from pg_settings where name = 'wal_segment_size'",
@@ -194,13 +215,29 @@ fn generate_internal<C: postgres::GenericClient>(
         "Unexpected wal_segment_size in bytes"
     );

+    Ok(())
+}
+
+pub trait Crafter {
+    const NAME: &'static str;
+
+    /// Generates WAL using the client `client`. Returns the expected end-of-wal LSN.
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn>;
+}
+
+fn craft_internal<C: postgres::GenericClient>(
+    client: &mut C,
+    f: impl Fn(&mut C, PgLsn) -> Result<Option<PgLsn>>,
+) -> Result<PgLsn> {
+    ensure_server_config(client)?;
+
     let initial_lsn = client.pg_current_wal_insert_lsn()?;
     info!("LSN initial = {}", initial_lsn);

     let last_lsn = match f(client, initial_lsn)? {
         None => client.pg_current_wal_insert_lsn()?,
         Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
-            Ordering::Less => bail!("Some records were inserted after the generated WAL"),
+            Ordering::Less => bail!("Some records were inserted after the crafted WAL"),
             Ordering::Equal => last_lsn,
             Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
         },
@@ -209,25 +246,116 @@ fn generate_internal<C: postgres::GenericClient>(
     // Some records may be not flushed, e.g. non-transactional logical messages.
     client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
     match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
-        Ordering::Less => bail!("Some records were flushed after the generated WAL"),
+        Ordering::Less => bail!("Some records were flushed after the crafted WAL"),
         Ordering::Equal => {}
         Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
     }
     Ok(last_lsn)
 }

-pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
-    generate_internal(client, |client, _| {
-        client.execute("CREATE table t(x int)", &[])?;
-        Ok(None)
-    })
+pub struct Simple;
+impl Crafter for Simple {
+    const NAME: &'static str = "simple";
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
+        craft_internal(client, |client, _| {
+            client.execute("CREATE table t(x int)", &[])?;
+            Ok(None)
+        })
+    }
 }

-fn generate_single_logical_message(
+pub struct LastWalRecordXlogSwitch;
+impl Crafter for LastWalRecordXlogSwitch {
+    const NAME: &'static str = "last_wal_record_xlog_switch";
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
+        // Do not use generate_internal because here we end up with flush_lsn exactly on
+        // the segment boundary and insert_lsn after the initial page header, which is unusual.
+        ensure_server_config(client)?;
+
+        client.execute("CREATE table t(x int)", &[])?;
+        let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+        let next_segment = PgLsn::from(0x0200_0000);
+        ensure!(
+            after_xlog_switch <= next_segment,
+            "XLOG_SWITCH message ended after the expected segment boundary: {} > {}",
+            after_xlog_switch,
+            next_segment
+        );
+        Ok(next_segment)
+    }
+}
+
+pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
+impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
+    const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
+        // Do not use generate_internal because here we end up with flush_lsn exactly on
+        // the segment boundary and insert_lsn after the initial page header, which is unusual.
+        ensure_server_config(client)?;
+
+        client.execute("CREATE table t(x int)", &[])?;
+
+        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
+        // We will use logical message as the padding. We start with detecting how much WAL
+        // it takes for one logical message, considering all alignments and headers.
+        let base_wal_advance = {
+            let before_lsn = client.pg_current_wal_insert_lsn()?;
+            // Small non-empty message bigger than few bytes is more likely than an empty
+            // message to have the same format as the big padding message.
+            client.execute(
+                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
+                &[],
+            )?;
+            // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
+            (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
+                + XLOG_SIZE_OF_XLOG_RECORD
+        };
+        let mut remaining_lsn =
+            XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
+        if remaining_lsn < base_wal_advance {
+            remaining_lsn += XLOG_BLCKSZ;
+        }
+        let repeats = 10 + remaining_lsn - base_wal_advance;
+        info!(
+            "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
+            client.pg_current_wal_insert_lsn()?,
+            remaining_lsn,
+            base_wal_advance,
+            repeats
+        );
+        client.execute(
+            "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
+            &[&(repeats as i32)],
+        )?;
+        info!(
+            "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
+            client.pg_current_wal_insert_lsn()?,
+            XLOG_SIZE_OF_XLOG_RECORD
+        );
+
+        // Emit the XLOG_SWITCH
+        let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+        let next_segment = PgLsn::from(0x0200_0000);
+        ensure!(
+            after_xlog_switch < next_segment,
+            "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}",
+            after_xlog_switch,
+            next_segment
+        );
+        ensure!(
+            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
+            "XLOG_SWITCH message ended not on page boundary: {}",
+            after_xlog_switch
+        );
+        Ok(next_segment)
+    }
+}
+
+fn craft_single_logical_message(
     client: &mut impl postgres::GenericClient,
     transactional: bool,
 ) -> Result<PgLsn> {
-    generate_internal(client, |client, initial_lsn| {
+    craft_internal(client, |client, initial_lsn| {
         ensure!(
             initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
             "Initial LSN is too far in the future"
@@ -265,14 +393,18 @@ fn generate_single_logical_message(
     })
 }

-pub fn generate_wal_record_crossing_segment_followed_by_small_one(
-    client: &mut impl postgres::GenericClient,
-) -> Result<PgLsn> {
-    generate_single_logical_message(client, true)
+pub struct WalRecordCrossingSegmentFollowedBySmallOne;
+impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
+    const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
+        craft_single_logical_message(client, true)
+    }
 }

-pub fn generate_last_wal_record_crossing_segment<C: postgres::GenericClient>(
-    client: &mut C,
-) -> Result<PgLsn> {
-    generate_single_logical_message(client, false)
+pub struct LastWalRecordCrossingSegment;
+impl Crafter for LastWalRecordCrossingSegment {
+    const NAME: &'static str = "last_wal_record_crossing_segment";
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
+        craft_single_logical_message(client, false)
+    }
 }
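To show how the new `Crafter` trait composes, here is a sketch of one more implementation, written as if it lived next to the ones above so it can reuse the private `craft_internal` helper. The `TwoSmallRecords` name and its SQL are hypothetical and not part of the change; a real crafter would also need its `NAME` wired into the `wal_craft` binary's `--type` handling:

// Hypothetical crafter, for illustration only.
pub struct TwoSmallRecords;
impl Crafter for TwoSmallRecords {
    const NAME: &'static str = "two_small_records";
    fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
        craft_internal(client, |client, _initial_lsn| {
            // Emit two tiny records; returning None lets craft_internal read the
            // resulting insert LSN back from the server itself.
            client.execute("CREATE table t(x int)", &[])?;
            client.execute("INSERT INTO t VALUES (1)", &[])?;
            Ok(None)
        })
    }
}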
@@ -1,58 +0,0 @@
-use anyhow::*;
-use clap::{App, Arg};
-use wal_generate::*;
-
-fn main() -> Result<()> {
-    env_logger::Builder::from_env(
-        env_logger::Env::default().default_filter_or("wal_generate=info"),
-    )
-    .init();
-    let arg_matches = App::new("Postgres WAL generator")
-        .about("Generates Postgres databases with specific WAL properties")
-        .arg(
-            Arg::new("datadir")
-                .short('D')
-                .long("datadir")
-                .takes_value(true)
-                .help("Data directory for the Postgres server")
-                .required(true)
-        )
-        .arg(
-            Arg::new("pg-distrib-dir")
-                .long("pg-distrib-dir")
-                .takes_value(true)
-                .help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)")
-                .default_value("/usr/local")
-        )
-        .arg(
-            Arg::new("type")
-                .long("type")
-                .takes_value(true)
-                .help("Type of WAL to generate")
-                .possible_values(["simple", "last_wal_record_crossing_segment", "wal_record_crossing_segment_followed_by_small_one"])
-                .required(true)
-        )
-        .get_matches();
-
-    let cfg = Conf {
-        pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
-        datadir: arg_matches.value_of("datadir").unwrap().into(),
-    };
-    cfg.initdb()?;
-    let mut srv = cfg.start_server()?;
-    let lsn = match arg_matches.value_of("type").unwrap() {
-        "simple" => generate_simple(&mut srv.connect_with_timeout()?)?,
-        "last_wal_record_crossing_segment" => {
-            generate_last_wal_record_crossing_segment(&mut srv.connect_with_timeout()?)?
-        }
-        "wal_record_crossing_segment_followed_by_small_one" => {
-            generate_wal_record_crossing_segment_followed_by_small_one(
-                &mut srv.connect_with_timeout()?,
-            )?
-        }
-        a => panic!("Unknown --type argument: {}", a),
-    };
-    println!("end_of_wal = {}", lsn);
-    srv.kill();
-    Ok(())
-}
@@ -12,8 +12,10 @@ use std::{
     borrow::Cow,
     collections::HashMap,
     ffi::OsStr,
+    fmt::Debug,
     num::{NonZeroU32, NonZeroUsize},
     path::{Path, PathBuf},
+    pin::Pin,
 };

 use anyhow::{bail, Context};

@@ -40,13 +42,19 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;

+pub trait RemoteObjectName {
+    // Needed to retrieve the last component of a RemoteObjectId,
+    // in other words a file name.
+    fn object_name(&self) -> Option<&str>;
+}
+
 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
 /// providing basic CRUD operations for storage files.
 #[async_trait::async_trait]
 pub trait RemoteStorage: Send + Sync {
     /// A way to uniquely reference a file in the remote storage.
-    type RemoteObjectId;
+    type RemoteObjectId: RemoteObjectName;

     /// Attempts to derive the storage path out of the local path, if the latter is correct.
     fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId>;

@@ -57,6 +65,12 @@ pub trait RemoteStorage: Send + Sync {
     /// Lists all items the storage has right now.
     async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;

+    /// Lists all top level subdirectories for a given prefix
+    async fn list_prefixes(
+        &self,
+        prefix: Option<Self::RemoteObjectId>,
+    ) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
+
     /// Streams the local file contents into the remote storage entry.
     async fn upload(
         &self,

@@ -70,11 +84,7 @@ pub trait RemoteStorage: Send + Sync {

     /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
     /// Returns the metadata, if any was stored with the file previously.
-    async fn download(
-        &self,
-        from: &Self::RemoteObjectId,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<Option<StorageMetadata>>;
+    async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError>;

     /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
     /// Returns the metadata, if any was stored with the file previously.

@@ -83,12 +93,49 @@ pub trait RemoteStorage: Send + Sync {
         from: &Self::RemoteObjectId,
         start_inclusive: u64,
         end_exclusive: Option<u64>,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<Option<StorageMetadata>>;
+    ) -> Result<Download, DownloadError>;

     async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>;
 }

+pub struct Download {
+    pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send>>,
+    /// Extra key-value data, associated with the current remote file.
+    pub metadata: Option<StorageMetadata>,
+}
+
+impl Debug for Download {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Download")
+            .field("metadata", &self.metadata)
+            .finish()
+    }
+}
+
+#[derive(Debug)]
+pub enum DownloadError {
+    /// Validation or other error happened due to user input.
+    BadInput(anyhow::Error),
+    /// The file was not found in the remote storage.
+    NotFound,
+    /// The file was found in the remote storage, but the download failed.
+    Other(anyhow::Error),
+}
+
+impl std::fmt::Display for DownloadError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            DownloadError::BadInput(e) => {
+                write!(f, "Failed to download a remote file due to user input: {e}")
+            }
+            DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
+            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e}"),
+        }
+    }
+}
+
+impl std::error::Error for DownloadError {}
+
 /// Every storage currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
 pub enum GenericRemoteStorage {

@@ -180,7 +227,7 @@ pub struct S3Config {
     pub concurrency_limit: NonZeroUsize,
 }

-impl std::fmt::Debug for S3Config {
+impl Debug for S3Config {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.debug_struct("S3Config")
             .field("bucket_name", &self.bucket_name)
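As a usage sketch for the reworked `download` API, a caller might consume a `Download` roughly like this; the `remote_storage` crate path in the imports and the helper name are assumptions made for this illustration:

use remote_storage::{Download, DownloadError, RemoteStorage};
use tokio::io::AsyncReadExt;

// Illustrative helper: fetch an object's bytes, treating a missing object as
// `None` instead of an error, and surfacing anything else as anyhow::Error.
async fn try_download_bytes<S: RemoteStorage>(
    storage: &S,
    id: &S::RemoteObjectId,
) -> anyhow::Result<Option<Vec<u8>>> {
    match storage.download(id).await {
        Ok(Download {
            mut download_stream,
            metadata,
        }) => {
            let mut buf = Vec::new();
            download_stream.read_to_end(&mut buf).await?;
            if let Some(metadata) = metadata {
                tracing::debug!("downloaded object with metadata: {metadata:?}");
            }
            Ok(Some(buf))
        }
        Err(DownloadError::NotFound) => Ok(None),
        Err(other) => Err(anyhow::anyhow!(other)),
    }
}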
@@ -5,6 +5,7 @@
 //! volume is mounted to the local FS.

 use std::{
+    borrow::Cow,
     future::Future,
     path::{Path, PathBuf},
     pin::Pin,

@@ -17,10 +18,16 @@ use tokio::{
 };
 use tracing::*;

-use crate::path_with_suffix_extension;
+use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectName};

 use super::{strip_path_prefix, RemoteStorage, StorageMetadata};

+impl RemoteObjectName for PathBuf {
+    fn object_name(&self) -> Option<&str> {
+        self.file_stem().and_then(|n| n.to_str())
+    }
+}
+
 pub struct LocalFs {
     working_directory: PathBuf,
     storage_root: PathBuf,

@@ -101,7 +108,18 @@ impl RemoteStorage for LocalFs {
     }

     async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
-        get_all_files(&self.storage_root).await
+        get_all_files(&self.storage_root, true).await
+    }
+
+    async fn list_prefixes(
+        &self,
+        prefix: Option<Self::RemoteObjectId>,
+    ) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
+        let path = match prefix {
+            Some(prefix) => Cow::Owned(self.storage_root.join(prefix)),
+            None => Cow::Borrowed(&self.storage_root),
+        };
+        get_all_files(path.as_ref(), false).await
     }

     async fn upload(

@@ -192,15 +210,12 @@ impl RemoteStorage for LocalFs {
         Ok(())
     }

-    async fn download(
-        &self,
-        from: &Self::RemoteObjectId,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<Option<StorageMetadata>> {
-        let file_path = self.resolve_in_storage(from)?;
-
-        if file_path.exists() && file_path.is_file() {
-            let mut source = io::BufReader::new(
+    async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError> {
+        let file_path = self
+            .resolve_in_storage(from)
+            .map_err(DownloadError::BadInput)?;
+        if file_exists(&file_path).map_err(DownloadError::BadInput)? {
+            let source = io::BufReader::new(
                 fs::OpenOptions::new()
                     .read(true)
                     .open(&file_path)

@@ -210,22 +225,20 @@ impl RemoteStorage for LocalFs {
                             "Failed to open source file '{}' to use in the download",
                             file_path.display()
                         )
-                    })?,
+                    })
+                    .map_err(DownloadError::Other)?,
             );
-            io::copy(&mut source, to).await.with_context(|| {
-                format!(
-                    "Failed to download file '{}' from the local storage",
-                    file_path.display()
-                )
-            })?;
-            source.flush().await?;

-            self.read_storage_metadata(&file_path).await
+            let metadata = self
+                .read_storage_metadata(&file_path)
+                .await
+                .map_err(DownloadError::Other)?;
+            Ok(Download {
+                metadata,
+                download_stream: Box::pin(source),
+            })
         } else {
-            bail!(
-                "File '{}' either does not exist or is not a file",
-                file_path.display()
-            )
+            Err(DownloadError::NotFound)
         }
     }
@@ -234,22 +247,19 @@ impl RemoteStorage for LocalFs {
         from: &Self::RemoteObjectId,
         start_inclusive: u64,
         end_exclusive: Option<u64>,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<Option<StorageMetadata>> {
+    ) -> Result<Download, DownloadError> {
         if let Some(end_exclusive) = end_exclusive {
-            ensure!(
-                end_exclusive > start_inclusive,
-                "Invalid range, start ({}) is bigger then end ({:?})",
-                start_inclusive,
-                end_exclusive
-            );
+            if end_exclusive <= start_inclusive {
+                return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) is not less than end_exclusive ({end_exclusive:?})")));
+            };
             if start_inclusive == end_exclusive.saturating_sub(1) {
-                return Ok(None);
+                return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
             }
         }
-        let file_path = self.resolve_in_storage(from)?;
-        if file_path.exists() && file_path.is_file() {
+        let file_path = self
+            .resolve_in_storage(from)
+            .map_err(DownloadError::BadInput)?;
+        if file_exists(&file_path).map_err(DownloadError::BadInput)? {
             let mut source = io::BufReader::new(
                 fs::OpenOptions::new()
                     .read(true)

@@ -260,31 +270,31 @@ impl RemoteStorage for LocalFs {
                             "Failed to open source file '{}' to use in the download",
                             file_path.display()
                         )
-                    })?,
+                    })
+                    .map_err(DownloadError::Other)?,
             );
             source
                 .seek(io::SeekFrom::Start(start_inclusive))
                 .await
-                .context("Failed to seek to the range start in a local storage file")?;
-            match end_exclusive {
-                Some(end_exclusive) => {
-                    io::copy(&mut source.take(end_exclusive - start_inclusive), to).await
-                }
-                None => io::copy(&mut source, to).await,
-            }
-            .with_context(|| {
-                format!(
-                    "Failed to download file '{}' range from the local storage",
-                    file_path.display()
-                )
-            })?;
-
-            self.read_storage_metadata(&file_path).await
+                .context("Failed to seek to the range start in a local storage file")
+                .map_err(DownloadError::Other)?;
+            let metadata = self
+                .read_storage_metadata(&file_path)
+                .await
+                .map_err(DownloadError::Other)?;
+
+            Ok(match end_exclusive {
+                Some(end_exclusive) => Download {
+                    metadata,
+                    download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
+                },
+                None => Download {
+                    metadata,
+                    download_stream: Box::pin(source),
+                },
+            })
         } else {
-            bail!(
-                "File '{}' either does not exist or is not a file",
-                file_path.display()
-            )
+            Err(DownloadError::NotFound)
         }
     }

@@ -307,6 +317,7 @@ fn storage_metadata_path(original_path: &Path) -> PathBuf {

 fn get_all_files<'a, P>(
     directory_path: P,
+    recursive: bool,
 ) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
 where
     P: AsRef<Path> + Send + Sync + 'a,

@@ -323,7 +334,11 @@ where
                 if file_type.is_symlink() {
                     debug!("{:?} is a symlink, skipping", entry_path)
                 } else if file_type.is_dir() {
-                    paths.extend(get_all_files(entry_path).await?.into_iter())
+                    if recursive {
+                        paths.extend(get_all_files(entry_path, true).await?.into_iter())
+                    } else {
+                        paths.push(dir_entry.path())
+                    }
                 } else {
                     paths.push(dir_entry.path());
                 }

@@ -352,6 +367,19 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
     Ok(())
 }

+fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
+    if file_path.exists() {
+        ensure!(
+            file_path.is_file(),
+            "file path '{}' is not a file",
+            file_path.display()
+        );
+        Ok(true)
+    } else {
+        Ok(false)
+    }
+}
+
 #[cfg(test)]
 mod pure_tests {
     use tempfile::tempdir;
@@ -518,6 +546,31 @@ mod fs_tests {
     use std::{collections::HashMap, io::Write};
     use tempfile::tempdir;

+    async fn read_and_assert_remote_file_contents(
+        storage: &LocalFs,
+        #[allow(clippy::ptr_arg)]
+        // have to use &PathBuf due to `storage.local_path` parameter requirements
+        remote_storage_path: &PathBuf,
+        expected_metadata: Option<&StorageMetadata>,
+    ) -> anyhow::Result<String> {
+        let mut download = storage
+            .download(remote_storage_path)
+            .await
+            .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
+        ensure!(
+            download.metadata.as_ref() == expected_metadata,
+            "Unexpected metadata returned for the downloaded file"
+        );
+
+        let mut contents = String::new();
+        download
+            .download_stream
+            .read_to_string(&mut contents)
+            .await
+            .context("Failed to read remote file contents into string")?;
+        Ok(contents)
+    }
+
     #[tokio::test]
     async fn upload_file() -> anyhow::Result<()> {
         let workdir = tempdir()?.path().to_owned();

@@ -568,15 +621,7 @@ mod fs_tests {
         let upload_name = "upload_1";
         let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;

-        let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let metadata = storage.download(&upload_target, &mut content_bytes).await?;
-        assert!(
-            metadata.is_none(),
-            "No metadata should be returned for no metadata upload"
-        );
-
-        content_bytes.flush().await?;
-        let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
+        let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
         assert_eq!(
             dummy_contents(upload_name),
             contents,

@@ -584,13 +629,9 @@ mod fs_tests {
         );

         let non_existing_path = PathBuf::from("somewhere").join("else");
-        match storage.download(&non_existing_path, &mut io::sink()).await {
-            Ok(_) => panic!("Should not allow downloading non-existing storage files"),
-            Err(e) => {
-                let error_string = e.to_string();
-                assert!(error_string.contains("does not exist"));
-                assert!(error_string.contains(&non_existing_path.display().to_string()));
-            }
+        match storage.download(&non_existing_path).await {
+            Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
+            other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
         }
         Ok(())
     }

@@ -603,58 +644,31 @@ mod fs_tests {
         let upload_name = "upload_1";
         let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;

-        let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let metadata = storage
-            .download_byte_range(&upload_target, 0, None, &mut full_range_bytes)
-            .await?;
-        assert!(
-            metadata.is_none(),
-            "No metadata should be returned for no metadata upload"
-        );
-        full_range_bytes.flush().await?;
+        let full_range_download_contents =
+            read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
         assert_eq!(
             dummy_contents(upload_name),
-            String::from_utf8(full_range_bytes.into_inner().into_inner())?,
+            full_range_download_contents,
             "Download full range should return the whole upload"
         );

-        let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let same_byte = 1_000_000_000;
-        let metadata = storage
-            .download_byte_range(
-                &upload_target,
-                same_byte,
-                Some(same_byte + 1), // exclusive end
-                &mut zero_range_bytes,
-            )
-            .await?;
-        assert!(
-            metadata.is_none(),
-            "No metadata should be returned for no metadata upload"
-        );
-        zero_range_bytes.flush().await?;
-        assert!(
-            zero_range_bytes.into_inner().into_inner().is_empty(),
-            "Zero byte range should not download any part of the file"
-        );
-
         let uploaded_bytes = dummy_contents(upload_name).into_bytes();
         let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);

-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let metadata = storage
-            .download_byte_range(
-                &upload_target,
-                0,
-                Some(first_part_local.len() as u64),
-                &mut first_part_remote,
-            )
+        let mut first_part_download = storage
+            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
             .await?;
         assert!(
-            metadata.is_none(),
+            first_part_download.metadata.is_none(),
             "No metadata should be returned for no metadata upload"
         );
+
+        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
+        io::copy(
+            &mut first_part_download.download_stream,
+            &mut first_part_remote,
+        )
+        .await?;
         first_part_remote.flush().await?;
         let first_part_remote = first_part_remote.into_inner().into_inner();
         assert_eq!(

@@ -663,20 +677,24 @@ mod fs_tests {
             "First part bytes should be returned when requested"
         );

-        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let metadata = storage
+        let mut second_part_download = storage
             .download_byte_range(
                 &upload_target,
                 first_part_local.len() as u64,
                 Some((first_part_local.len() + second_part_local.len()) as u64),
-                &mut second_part_remote,
             )
             .await?;
         assert!(
-            metadata.is_none(),
+            second_part_download.metadata.is_none(),
             "No metadata should be returned for no metadata upload"
         );
+
+        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
+        io::copy(
+            &mut second_part_download.download_stream,
+            &mut second_part_remote,
+        )
+        .await?;
         second_part_remote.flush().await?;
         let second_part_remote = second_part_remote.into_inner().into_inner();
         assert_eq!(
@@ -696,11 +714,30 @@ mod fs_tests {
         let upload_name = "upload_1";
         let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;

+        let start = 1_000_000_000;
+        let end = start + 1;
+        match storage
+            .download_byte_range(
+                &upload_target,
+                start,
+                Some(end), // exclusive end
+            )
+            .await
+        {
+            Ok(_) => panic!("Should not allow downloading wrong ranges"),
+            Err(e) => {
+                let error_string = e.to_string();
+                assert!(error_string.contains("zero bytes"));
+                assert!(error_string.contains(&start.to_string()));
+                assert!(error_string.contains(&end.to_string()));
+            }
+        }
+
         let start = 10000;
         let end = 234;
         assert!(start > end, "Should test an incorrect range");
         match storage
-            .download_byte_range(&upload_target, start, Some(end), &mut io::sink())
+            .download_byte_range(&upload_target, start, Some(end))
             .await
         {
             Ok(_) => panic!("Should not allow downloading wrong ranges"),

@@ -712,18 +749,6 @@ mod fs_tests {
             }
         }

-        let non_existing_path = PathBuf::from("somewhere").join("else");
-        match storage
-            .download_byte_range(&non_existing_path, 1, Some(3), &mut io::sink())
-            .await
-        {
-            Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"),
-            Err(e) => {
-                let error_string = e.to_string();
-                assert!(error_string.contains("does not exist"));
-                assert!(error_string.contains(&non_existing_path.display().to_string()));
-            }
-        }
         Ok(())
     }

@@ -762,35 +787,26 @@ mod fs_tests {
         let upload_target =
             upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?;

-        let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?;
-
-        content_bytes.flush().await?;
-        let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
+        let full_range_download_contents =
+            read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?;
         assert_eq!(
             dummy_contents(upload_name),
-            contents,
+            full_range_download_contents,
             "We should upload and download the same contents"
         );

-        assert_eq!(
-            full_download_metadata.as_ref(),
-            Some(&metadata),
-            "We should get the same metadata back for full download"
-        );
-
         let uploaded_bytes = dummy_contents(upload_name).into_bytes();
         let (first_part_local, _) = uploaded_bytes.split_at(3);

-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let partial_download_metadata = storage
-            .download_byte_range(
-                &upload_target,
-                0,
-                Some(first_part_local.len() as u64),
-                &mut first_part_remote,
-            )
+        let mut partial_download_with_metadata = storage
+            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
             .await?;
|
.await?;
|
||||||
|
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||||
|
io::copy(
|
||||||
|
&mut partial_download_with_metadata.download_stream,
|
||||||
|
&mut first_part_remote,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
first_part_remote.flush().await?;
|
first_part_remote.flush().await?;
|
||||||
let first_part_remote = first_part_remote.into_inner().into_inner();
|
let first_part_remote = first_part_remote.into_inner().into_inner();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
@@ -800,8 +816,8 @@ mod fs_tests {
|
|||||||
);
|
);
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
partial_download_metadata.as_ref(),
|
partial_download_with_metadata.metadata,
|
||||||
Some(&metadata),
|
Some(metadata),
|
||||||
"We should get the same metadata back for partial download"
|
"We should get the same metadata back for partial download"
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -843,7 +859,7 @@ mod fs_tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn dummy_contents(name: &str) -> String {
|
fn dummy_contents(name: &str) -> String {
|
||||||
format!("contents for {}", name)
|
format!("contents for {name}")
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
|
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
|
||||||
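
Aside: the tests above switch from passing an output writer into `download_byte_range` to consuming a returned download value that carries a `download_stream` reader plus optional `metadata`. A minimal sketch of a helper that drains such a stream into memory the same way the tests do (the helper itself is illustrative and not part of the diff; it works for any async reader, e.g. `collect_stream(&mut download.download_stream).await?`):

use anyhow::Context;
use tokio::io::{self, AsyncRead, AsyncWriteExt};

// Drain an async reader into an in-memory buffer, using the same
// BufWriter-over-Cursor pattern the tests above use as a sink.
async fn collect_stream<R>(reader: &mut R) -> anyhow::Result<Vec<u8>>
where
    R: AsyncRead + Unpin,
{
    let mut sink = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
    io::copy(reader, &mut sink)
        .await
        .context("failed to drain download stream")?;
    sink.flush().await?;
    Ok(sink.into_inner().into_inner())
}
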
|
|||||||
@@ -9,17 +9,19 @@ use std::path::{Path, PathBuf};
|
|||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use rusoto_core::{
|
use rusoto_core::{
|
||||||
credential::{InstanceMetadataProvider, StaticProvider},
|
credential::{InstanceMetadataProvider, StaticProvider},
|
||||||
HttpClient, Region,
|
HttpClient, Region, RusotoError,
|
||||||
};
|
};
|
||||||
use rusoto_s3::{
|
use rusoto_s3::{
|
||||||
DeleteObjectRequest, GetObjectRequest, ListObjectsV2Request, PutObjectRequest, S3Client,
|
DeleteObjectRequest, GetObjectError, GetObjectRequest, ListObjectsV2Request, PutObjectRequest,
|
||||||
StreamingBody, S3,
|
S3Client, StreamingBody, S3,
|
||||||
};
|
};
|
||||||
use tokio::{io, sync::Semaphore};
|
use tokio::{io, sync::Semaphore};
|
||||||
use tokio_util::io::ReaderStream;
|
use tokio_util::io::ReaderStream;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use crate::{strip_path_prefix, RemoteStorage, S3Config};
|
use crate::{
|
||||||
|
strip_path_prefix, Download, DownloadError, RemoteObjectName, RemoteStorage, S3Config,
|
||||||
|
};
|
||||||
|
|
||||||
use super::StorageMetadata;
|
use super::StorageMetadata;
|
||||||
|
|
||||||
@@ -117,6 +119,25 @@ impl S3ObjectKey {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl RemoteObjectName for S3ObjectKey {
|
||||||
|
/// Turn a/b/c or a/b/c/ into c
|
||||||
|
fn object_name(&self) -> Option<&str> {
|
||||||
|
// corner case, char::to_string is not const, thats why this is more verbose than it needs to be
|
||||||
|
// see https://github.com/rust-lang/rust/issues/88674
|
||||||
|
if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
if self.0.ends_with(S3_PREFIX_SEPARATOR) {
|
||||||
|
self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1)
|
||||||
|
} else {
|
||||||
|
self.0
|
||||||
|
.rsplit_once(S3_PREFIX_SEPARATOR)
|
||||||
|
.map(|(_, last)| last)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// AWS S3 storage.
|
/// AWS S3 storage.
|
||||||
pub struct S3Bucket {
|
pub struct S3Bucket {
|
||||||
workdir: PathBuf,
|
workdir: PathBuf,
|
||||||
@@ -187,6 +208,39 @@ impl S3Bucket {
|
|||||||
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
|
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||||
|
let _guard = self
|
||||||
|
.concurrency_limiter
|
||||||
|
.acquire()
|
||||||
|
.await
|
||||||
|
.context("Concurrency limiter semaphore got closed during S3 download")
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
|
metrics::inc_get_object();
|
||||||
|
|
||||||
|
match self.client.get_object(request).await {
|
||||||
|
Ok(object_output) => match object_output.body {
|
||||||
|
None => {
|
||||||
|
metrics::inc_get_object_fail();
|
||||||
|
Err(DownloadError::Other(anyhow::anyhow!(
|
||||||
|
"Got no body for the S3 object given"
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
Some(body) => Ok(Download {
|
||||||
|
metadata: object_output.metadata.map(StorageMetadata),
|
||||||
|
download_stream: Box::pin(io::BufReader::new(body.into_async_read())),
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
Err(RusotoError::Service(GetObjectError::NoSuchKey(_))) => Err(DownloadError::NotFound),
|
||||||
|
Err(e) => {
|
||||||
|
metrics::inc_get_object_fail();
|
||||||
|
Err(DownloadError::Other(anyhow::anyhow!(
|
||||||
|
"Failed to download S3 object: {e}"
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
@@ -250,6 +304,77 @@ impl RemoteStorage for S3Bucket {
|
|||||||
Ok(document_keys)
|
Ok(document_keys)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Note: it wont include empty "directories"
|
||||||
|
async fn list_prefixes(
|
||||||
|
&self,
|
||||||
|
prefix: Option<Self::RemoteObjectId>,
|
||||||
|
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
|
||||||
|
let list_prefix = match prefix {
|
||||||
|
Some(prefix) => {
|
||||||
|
let mut prefix_in_bucket = self.prefix_in_bucket.clone().unwrap_or_default();
|
||||||
|
// if there is no trailing / in default prefix and
|
||||||
|
// supplied prefix does not start with "/" insert it
|
||||||
|
if !(prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR)
|
||||||
|
|| prefix.0.starts_with(S3_PREFIX_SEPARATOR))
|
||||||
|
{
|
||||||
|
prefix_in_bucket.push(S3_PREFIX_SEPARATOR);
|
||||||
|
}
|
||||||
|
|
||||||
|
prefix_in_bucket.push_str(&prefix.0);
|
||||||
|
// required to end with a separator
|
||||||
|
// otherwise request will return only the entry of a prefix
|
||||||
|
if !prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR) {
|
||||||
|
prefix_in_bucket.push(S3_PREFIX_SEPARATOR);
|
||||||
|
}
|
||||||
|
Some(prefix_in_bucket)
|
||||||
|
}
|
||||||
|
None => self.prefix_in_bucket.clone(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut document_keys = Vec::new();
|
||||||
|
|
||||||
|
let mut continuation_token = None;
|
||||||
|
loop {
|
||||||
|
let _guard = self
|
||||||
|
.concurrency_limiter
|
||||||
|
.acquire()
|
||||||
|
.await
|
||||||
|
.context("Concurrency limiter semaphore got closed during S3 list")?;
|
||||||
|
|
||||||
|
metrics::inc_list_objects();
|
||||||
|
|
||||||
|
let fetch_response = self
|
||||||
|
.client
|
||||||
|
.list_objects_v2(ListObjectsV2Request {
|
||||||
|
bucket: self.bucket_name.clone(),
|
||||||
|
prefix: list_prefix.clone(),
|
||||||
|
continuation_token,
|
||||||
|
delimiter: Some(S3_PREFIX_SEPARATOR.to_string()),
|
||||||
|
..ListObjectsV2Request::default()
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
metrics::inc_list_objects_fail();
|
||||||
|
e
|
||||||
|
})?;
|
||||||
|
|
||||||
|
document_keys.extend(
|
||||||
|
fetch_response
|
||||||
|
.common_prefixes
|
||||||
|
.unwrap_or_default()
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|o| Some(S3ObjectKey(o.prefix?))),
|
||||||
|
);
|
||||||
|
|
||||||
|
match fetch_response.continuation_token {
|
||||||
|
Some(new_token) => continuation_token = Some(new_token),
|
||||||
|
None => break,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(document_keys)
|
||||||
|
}
|
||||||
|
|
||||||
async fn upload(
|
async fn upload(
|
||||||
&self,
|
&self,
|
||||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||||
@@ -283,38 +408,13 @@ impl RemoteStorage for S3Bucket {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn download(
|
async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError> {
|
||||||
&self,
|
self.download_object(GetObjectRequest {
|
||||||
from: &Self::RemoteObjectId,
|
bucket: self.bucket_name.clone(),
|
||||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
key: from.key().to_owned(),
|
||||||
) -> anyhow::Result<Option<StorageMetadata>> {
|
..GetObjectRequest::default()
|
||||||
let _guard = self
|
})
|
||||||
.concurrency_limiter
|
.await
|
||||||
.acquire()
|
|
||||||
.await
|
|
||||||
.context("Concurrency limiter semaphore got closed during S3 download")?;
|
|
||||||
|
|
||||||
metrics::inc_get_object();
|
|
||||||
|
|
||||||
let object_output = self
|
|
||||||
.client
|
|
||||||
.get_object(GetObjectRequest {
|
|
||||||
bucket: self.bucket_name.clone(),
|
|
||||||
key: from.key().to_owned(),
|
|
||||||
..GetObjectRequest::default()
|
|
||||||
})
|
|
||||||
.await
|
|
||||||
.map_err(|e| {
|
|
||||||
metrics::inc_get_object_fail();
|
|
||||||
e
|
|
||||||
})?;
|
|
||||||
|
|
||||||
if let Some(body) = object_output.body {
|
|
||||||
let mut from = io::BufReader::new(body.into_async_read());
|
|
||||||
io::copy(&mut from, to).await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(object_output.metadata.map(StorageMetadata))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn download_byte_range(
|
async fn download_byte_range(
|
||||||
@@ -322,8 +422,7 @@ impl RemoteStorage for S3Bucket {
|
|||||||
from: &Self::RemoteObjectId,
|
from: &Self::RemoteObjectId,
|
||||||
start_inclusive: u64,
|
start_inclusive: u64,
|
||||||
end_exclusive: Option<u64>,
|
end_exclusive: Option<u64>,
|
||||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
) -> Result<Download, DownloadError> {
|
||||||
) -> anyhow::Result<Option<StorageMetadata>> {
|
|
||||||
// S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
|
// S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
|
||||||
// and needs both ends to be exclusive
|
// and needs both ends to be exclusive
|
||||||
let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
|
let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
|
||||||
@@ -331,34 +430,14 @@ impl RemoteStorage for S3Bucket {
|
|||||||
Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive),
|
Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive),
|
||||||
None => format!("bytes={}-", start_inclusive),
|
None => format!("bytes={}-", start_inclusive),
|
||||||
});
|
});
|
||||||
let _guard = self
|
|
||||||
.concurrency_limiter
|
|
||||||
.acquire()
|
|
||||||
.await
|
|
||||||
.context("Concurrency limiter semaphore got closed during S3 range download")?;
|
|
||||||
|
|
||||||
metrics::inc_get_object();
|
self.download_object(GetObjectRequest {
|
||||||
|
bucket: self.bucket_name.clone(),
|
||||||
let object_output = self
|
key: from.key().to_owned(),
|
||||||
.client
|
range,
|
||||||
.get_object(GetObjectRequest {
|
..GetObjectRequest::default()
|
||||||
bucket: self.bucket_name.clone(),
|
})
|
||||||
key: from.key().to_owned(),
|
.await
|
||||||
range,
|
|
||||||
..GetObjectRequest::default()
|
|
||||||
})
|
|
||||||
.await
|
|
||||||
.map_err(|e| {
|
|
||||||
metrics::inc_get_object_fail();
|
|
||||||
e
|
|
||||||
})?;
|
|
||||||
|
|
||||||
if let Some(body) = object_output.body {
|
|
||||||
let mut from = io::BufReader::new(body.into_async_read());
|
|
||||||
io::copy(&mut from, to).await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(object_output.metadata.map(StorageMetadata))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
|
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
|
||||||
@@ -391,6 +470,25 @@ mod tests {
|
|||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn object_name() {
|
||||||
|
let k = S3ObjectKey("a/b/c".to_owned());
|
||||||
|
assert_eq!(k.object_name(), Some("c"));
|
||||||
|
|
||||||
|
let k = S3ObjectKey("a/b/c/".to_owned());
|
||||||
|
assert_eq!(k.object_name(), Some("c"));
|
||||||
|
|
||||||
|
let k = S3ObjectKey("a/".to_owned());
|
||||||
|
assert_eq!(k.object_name(), Some("a"));
|
||||||
|
|
||||||
|
// XXX is it impossible to have an empty key?
|
||||||
|
let k = S3ObjectKey("".to_owned());
|
||||||
|
assert_eq!(k.object_name(), None);
|
||||||
|
|
||||||
|
let k = S3ObjectKey("/".to_owned());
|
||||||
|
assert_eq!(k.object_name(), None);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn download_destination() -> anyhow::Result<()> {
|
fn download_destination() -> anyhow::Result<()> {
|
||||||
let workdir = tempdir()?.path().to_owned();
|
let workdir = tempdir()?.path().to_owned();
|
||||||
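
Aside: the S3 changes above add a RemoteObjectName impl whose object_name turns "a/b/c" or "a/b/c/" into "c". The same rule, restated as a standalone function over &str together with the unit-test cases from the diff (the function name and the local SEP constant are illustrative; the real code uses S3_PREFIX_SEPARATOR on S3ObjectKey):

// Turn "a/b/c" or "a/b/c/" into "c"; bare "/" or "" have no object name.
fn object_name_of(key: &str) -> Option<&str> {
    const SEP: char = '/';
    if key.len() == 1 && key.starts_with(SEP) {
        return None;
    }
    if key.ends_with(SEP) {
        // "a/b/c/" -> rsplit yields "", "c", "b", "a"; take "c"
        key.rsplit(SEP).nth(1)
    } else {
        // "a/b/c" -> everything after the last separator
        key.rsplit_once(SEP).map(|(_, last)| last)
    }
}

#[test]
fn object_name_of_examples() {
    assert_eq!(object_name_of("a/b/c"), Some("c"));
    assert_eq!(object_name_of("a/b/c/"), Some("c"));
    assert_eq!(object_name_of("a/"), Some("a"));
    assert_eq!(object_name_of(""), None);
    assert_eq!(object_name_of("/"), None);
}
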
@@ -13,13 +13,10 @@ use std::fmt;
 use std::io::{self, Write};
 use std::net::{Shutdown, SocketAddr, TcpStream};
 use std::str::FromStr;
-use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
 use std::time::Duration;
 use tracing::*;
 
-static PGBACKEND_SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
-
 pub trait Handler {
     /// Handle single query.
     /// postgres_backend will issue ReadyForQuery after calling this (this
@@ -45,6 +42,10 @@ pub trait Handler {
     fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
         bail!("JWT auth failed")
     }
+
+    fn is_shutdown_requested(&self) -> bool {
+        false
+    }
 }
 
 /// PostgresBackend protocol state.
@@ -274,7 +275,7 @@ impl PostgresBackend {
 
         let mut unnamed_query_string = Bytes::new();
 
-        while !PGBACKEND_SHUTDOWN_REQUESTED.load(Ordering::Relaxed) {
+        while !handler.is_shutdown_requested() {
             match self.read_message() {
                 Ok(message) => {
                     if let Some(msg) = message {
@@ -493,8 +494,3 @@ impl PostgresBackend {
         Ok(ProcessMsgResult::Continue)
     }
 }
-
-// Set the flag to inform connections to cancel
-pub fn set_pgbackend_shutdown_requested() {
-    PGBACKEND_SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
-}
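
Aside: the diff above replaces the process-wide PGBACKEND_SHUTDOWN_REQUESTED flag with a per-handler is_shutdown_requested() hook. One way a handler implementation could back that hook with a shared flag is sketched below; the struct and function names are illustrative and not part of the change:

use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

struct ShutdownAwareHandler {
    shutdown_requested: Arc<AtomicBool>,
}

impl ShutdownAwareHandler {
    // The per-connection handler just reads the flag; this is what the new
    // trait hook would return instead of the old global static.
    fn is_shutdown_requested(&self) -> bool {
        self.shutdown_requested.load(Ordering::Relaxed)
    }
}

// Whatever task decides to shut down stores `true` through its own clone of
// the Arc; every connection sharing it leaves its read loop on the next
// iteration, matching the old global-flag behaviour but with a narrower scope.
fn request_shutdown(flag: &AtomicBool) {
    flag.store(true, Ordering::Relaxed);
}
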
@@ -926,10 +926,10 @@ impl<'a> BeMessage<'a> {
     }
 }
 
-// Zenith extension of postgres replication protocol
-// See ZENITH_STATUS_UPDATE_TAG_BYTE
+// Neon extension of postgres replication protocol
+// See NEON_STATUS_UPDATE_TAG_BYTE
 #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
-pub struct ZenithFeedback {
+pub struct ReplicationFeedback {
     // Last known size of the timeline. Used to enforce timeline size limit.
     pub current_timeline_size: u64,
     // Parts of StandbyStatusUpdate we resend to compute via safekeeper
@@ -939,13 +939,13 @@ pub struct ZenithFeedback {
     pub ps_replytime: SystemTime,
 }
 
-// NOTE: Do not forget to increment this number when adding new fields to ZenithFeedback.
+// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback.
 // Do not remove previously available fields because this might be backwards incompatible.
-pub const ZENITH_FEEDBACK_FIELDS_NUMBER: u8 = 5;
+pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5;
 
-impl ZenithFeedback {
-    pub fn empty() -> ZenithFeedback {
-        ZenithFeedback {
+impl ReplicationFeedback {
+    pub fn empty() -> ReplicationFeedback {
+        ReplicationFeedback {
             current_timeline_size: 0,
             ps_writelsn: 0,
             ps_applylsn: 0,
@@ -954,7 +954,7 @@ impl ZenithFeedback {
         }
     }
 
-    // Serialize ZenithFeedback using custom format
+    // Serialize ReplicationFeedback using custom format
     // to support protocol extensibility.
     //
     // Following layout is used:
@@ -965,7 +965,7 @@ impl ZenithFeedback {
     // uint32 - value length in bytes
     // value itself
     pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
-        buf.put_u8(ZENITH_FEEDBACK_FIELDS_NUMBER); // # of keys
+        buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys
         write_cstr(&Bytes::from("current_timeline_size"), buf)?;
         buf.put_i32(8);
         buf.put_u64(self.current_timeline_size);
@@ -992,9 +992,9 @@ impl ZenithFeedback {
         Ok(())
     }
 
-    // Deserialize ZenithFeedback message
-    pub fn parse(mut buf: Bytes) -> ZenithFeedback {
-        let mut zf = ZenithFeedback::empty();
+    // Deserialize ReplicationFeedback message
+    pub fn parse(mut buf: Bytes) -> ReplicationFeedback {
+        let mut zf = ReplicationFeedback::empty();
         let nfields = buf.get_u8();
         let mut i = 0;
         while i < nfields {
@@ -1035,14 +1035,14 @@ impl ZenithFeedback {
                 _ => {
                     let len = buf.get_i32();
                     warn!(
-                        "ZenithFeedback parse. unknown key {} of len {}. Skip it.",
+                        "ReplicationFeedback parse. unknown key {} of len {}. Skip it.",
                         key, len
                     );
                     buf.advance(len as usize);
                 }
             }
         }
-        trace!("ZenithFeedback parsed is {:?}", zf);
+        trace!("ReplicationFeedback parsed is {:?}", zf);
         zf
     }
 }
@@ -1052,8 +1052,8 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_zenithfeedback_serialization() {
-        let mut zf = ZenithFeedback::empty();
+    fn test_replication_feedback_serialization() {
+        let mut zf = ReplicationFeedback::empty();
         // Fill zf with some values
         zf.current_timeline_size = 12345678;
         // Set rounded time to be able to compare it with deserialized value,
@@ -1062,13 +1062,13 @@ mod tests {
         let mut data = BytesMut::new();
         zf.serialize(&mut data).unwrap();
 
-        let zf_parsed = ZenithFeedback::parse(data.freeze());
+        let zf_parsed = ReplicationFeedback::parse(data.freeze());
         assert_eq!(zf, zf_parsed);
     }
 
     #[test]
-    fn test_zenithfeedback_unknown_key() {
-        let mut zf = ZenithFeedback::empty();
+    fn test_replication_feedback_unknown_key() {
+        let mut zf = ReplicationFeedback::empty();
         // Fill zf with some values
         zf.current_timeline_size = 12345678;
         // Set rounded time to be able to compare it with deserialized value,
@@ -1079,7 +1079,7 @@ mod tests {
 
         // Add an extra field to the buffer and adjust number of keys
         if let Some(first) = data.first_mut() {
-            *first = ZENITH_FEEDBACK_FIELDS_NUMBER + 1;
+            *first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1;
         }
 
         write_cstr(&Bytes::from("new_field_one"), &mut data).unwrap();
@@ -1087,7 +1087,7 @@ mod tests {
         data.put_u64(42);
 
         // Parse serialized data and check that new field is not parsed
-        let zf_parsed = ZenithFeedback::parse(data.freeze());
+        let zf_parsed = ReplicationFeedback::parse(data.freeze());
         assert_eq!(zf, zf_parsed);
     }
 
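
Aside: the serialize/parse comments above describe the ReplicationFeedback wire layout: a one-byte field count, then for each field a NUL-terminated key name, a 4-byte value length, and the raw value bytes. A minimal sketch that writes a single current_timeline_size field in that layout, assuming only the `bytes` crate (no write_cstr helper, no error handling; for illustration only):

use bytes::{BufMut, BytesMut};

fn serialize_single_field(current_timeline_size: u64) -> BytesMut {
    let mut buf = BytesMut::new();
    buf.put_u8(1); // number of key/value pairs that follow
    buf.put_slice(b"current_timeline_size");
    buf.put_u8(0); // keys are written as C strings, so NUL-terminate
    buf.put_i32(8); // value length in bytes (a u64 takes 8)
    buf.put_u64(current_timeline_size);
    buf
}
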
@@ -14,7 +14,7 @@ use safekeeper::defaults::{
     DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
 };
 use std::collections::{BTreeSet, HashMap};
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::process::exit;
 use std::str::FromStr;
 use utils::{
@@ -159,6 +159,20 @@ fn main() -> Result<()> {
                 .about("Create a new blank timeline")
                 .arg(tenant_id_arg.clone())
                 .arg(branch_name_arg.clone()))
+            .subcommand(App::new("import")
+                .about("Import timeline from basebackup directory")
+                .arg(tenant_id_arg.clone())
+                .arg(timeline_id_arg.clone())
+                .arg(Arg::new("node-name").long("node-name").takes_value(true)
+                    .help("Name to assign to the imported timeline"))
+                .arg(Arg::new("base-tarfile").long("base-tarfile").takes_value(true)
+                    .help("Basebackup tarfile to import"))
+                .arg(Arg::new("base-lsn").long("base-lsn").takes_value(true)
+                    .help("Lsn the basebackup starts at"))
+                .arg(Arg::new("wal-tarfile").long("wal-tarfile").takes_value(true)
+                    .help("Wal to add after base"))
+                .arg(Arg::new("end-lsn").long("end-lsn").takes_value(true)
+                    .help("Lsn the basebackup ends at")))
         ).subcommand(
             App::new("tenant")
                 .setting(AppSettings::ArgRequiredElseHelp)
@@ -523,7 +537,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
     match tenant_match.subcommand() {
         Some(("list", _)) => {
             for t in pageserver.tenant_list()? {
-                println!("{} {}", t.id, t.state);
+                println!(
+                    "{} {}",
+                    t.id,
+                    t.state
+                        .map(|s| s.to_string())
+                        .unwrap_or_else(|| String::from(""))
+                );
             }
         }
         Some(("create", create_match)) => {
@@ -613,6 +633,43 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                 timeline.timeline_id, last_record_lsn, tenant_id,
             );
         }
+        Some(("import", import_match)) => {
+            let tenant_id = get_tenant_id(import_match, env)?;
+            let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided");
+            let name = import_match
+                .value_of("node-name")
+                .ok_or_else(|| anyhow!("No node name provided"))?;
+
+            // Parse base inputs
+            let base_tarfile = import_match
+                .value_of("base-tarfile")
+                .map(|s| PathBuf::from_str(s).unwrap())
+                .ok_or_else(|| anyhow!("No base-tarfile provided"))?;
+            let base_lsn = Lsn::from_str(
+                import_match
+                    .value_of("base-lsn")
+                    .ok_or_else(|| anyhow!("No base-lsn provided"))?,
+            )?;
+            let base = (base_lsn, base_tarfile);
+
+            // Parse pg_wal inputs
+            let wal_tarfile = import_match
+                .value_of("wal-tarfile")
+                .map(|s| PathBuf::from_str(s).unwrap());
+            let end_lsn = import_match
+                .value_of("end-lsn")
+                .map(|s| Lsn::from_str(s).unwrap());
+            // TODO validate both or none are provided
+            let pg_wal = end_lsn.zip(wal_tarfile);
+
+            let mut cplane = ComputeControlPlane::load(env.clone())?;
+            println!("Importing timeline into pageserver ...");
+            pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal)?;
+            println!("Creating node for imported timeline ...");
+            env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;
+            cplane.new_node(tenant_id, name, timeline_id, None, None)?;
+            println!("Done");
+        }
         Some(("branch", branch_match)) => {
             let tenant_id = get_tenant_id(branch_match, env)?;
             let new_branch_name = branch_match
@@ -61,6 +61,7 @@ utils = { path = "../libs/utils" }
 remote_storage = { path = "../libs/remote_storage" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
 close_fds = "0.3.2"
+walkdir = "2.3.2"
 
 [dev-dependencies]
 hex-literal = "0.3"
@@ -69,7 +69,7 @@ Repository
 
 The repository stores all the page versions, or WAL records needed to
 reconstruct them. Each tenant has a separate Repository, which is
-stored in the .zenith/tenants/<tenantid> directory.
+stored in the .neon/tenants/<tenantid> directory.
 
 Repository is an abstract trait, defined in `repository.rs`. It is
 implemented by the LayeredRepository object in
@@ -92,7 +92,7 @@ Each repository also has a WAL redo manager associated with it, see
 records, whenever we need to reconstruct a page version from WAL to
 satisfy a GetPage@LSN request, or to avoid accumulating too much WAL
 for a page. The WAL redo manager uses a Postgres process running in
-special zenith wal-redo mode to do the actual WAL redo, and
+special Neon wal-redo mode to do the actual WAL redo, and
 communicates with the process using a pipe.
 
 
|||||||
@@ -13,6 +13,7 @@
|
|||||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||||
use bytes::{BufMut, BytesMut};
|
use bytes::{BufMut, BytesMut};
|
||||||
use fail::fail_point;
|
use fail::fail_point;
|
||||||
|
use itertools::Itertools;
|
||||||
use std::fmt::Write as FmtWrite;
|
use std::fmt::Write as FmtWrite;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
@@ -21,7 +22,7 @@ use std::time::SystemTime;
|
|||||||
use tar::{Builder, EntryType, Header};
|
use tar::{Builder, EntryType, Header};
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
|
||||||
use crate::reltag::SlruKind;
|
use crate::reltag::{RelTag, SlruKind};
|
||||||
use crate::repository::Timeline;
|
use crate::repository::Timeline;
|
||||||
use crate::DatadirTimelineImpl;
|
use crate::DatadirTimelineImpl;
|
||||||
use postgres_ffi::xlog_utils::*;
|
use postgres_ffi::xlog_utils::*;
|
||||||
@@ -39,11 +40,12 @@ where
|
|||||||
timeline: &'a Arc<DatadirTimelineImpl>,
|
timeline: &'a Arc<DatadirTimelineImpl>,
|
||||||
pub lsn: Lsn,
|
pub lsn: Lsn,
|
||||||
prev_record_lsn: Lsn,
|
prev_record_lsn: Lsn,
|
||||||
|
full_backup: bool,
|
||||||
finished: bool,
|
finished: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create basebackup with non-rel data in it. Omit relational data.
|
// Create basebackup with non-rel data in it.
|
||||||
|
// Only include relational data if 'full_backup' is true.
|
||||||
//
|
//
|
||||||
// Currently we use empty lsn in two cases:
|
// Currently we use empty lsn in two cases:
|
||||||
// * During the basebackup right after timeline creation
|
// * During the basebackup right after timeline creation
|
||||||
@@ -58,6 +60,8 @@ where
|
|||||||
write: W,
|
write: W,
|
||||||
timeline: &'a Arc<DatadirTimelineImpl>,
|
timeline: &'a Arc<DatadirTimelineImpl>,
|
||||||
req_lsn: Option<Lsn>,
|
req_lsn: Option<Lsn>,
|
||||||
|
prev_lsn: Option<Lsn>,
|
||||||
|
full_backup: bool,
|
||||||
) -> Result<Basebackup<'a, W>> {
|
) -> Result<Basebackup<'a, W>> {
|
||||||
// Compute postgres doesn't have any previous WAL files, but the first
|
// Compute postgres doesn't have any previous WAL files, but the first
|
||||||
// record that it's going to write needs to include the LSN of the
|
// record that it's going to write needs to include the LSN of the
|
||||||
@@ -93,21 +97,34 @@ where
|
|||||||
(end_of_timeline.prev, end_of_timeline.last)
|
(end_of_timeline.prev, end_of_timeline.last)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Consolidate the derived and the provided prev_lsn values
|
||||||
|
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
|
||||||
|
if backup_prev != Lsn(0) {
|
||||||
|
ensure!(backup_prev == provided_prev_lsn)
|
||||||
|
}
|
||||||
|
provided_prev_lsn
|
||||||
|
} else {
|
||||||
|
backup_prev
|
||||||
|
};
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"taking basebackup lsn={}, prev_lsn={}",
|
"taking basebackup lsn={}, prev_lsn={} (full_backup={})",
|
||||||
backup_lsn, backup_prev
|
backup_lsn, prev_lsn, full_backup
|
||||||
);
|
);
|
||||||
|
|
||||||
Ok(Basebackup {
|
Ok(Basebackup {
|
||||||
ar: Builder::new(AbortableWrite::new(write)),
|
ar: Builder::new(AbortableWrite::new(write)),
|
||||||
timeline,
|
timeline,
|
||||||
lsn: backup_lsn,
|
lsn: backup_lsn,
|
||||||
prev_record_lsn: backup_prev,
|
prev_record_lsn: prev_lsn,
|
||||||
|
full_backup,
|
||||||
finished: false,
|
finished: false,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn send_tarball(mut self) -> anyhow::Result<()> {
|
pub fn send_tarball(mut self) -> anyhow::Result<()> {
|
||||||
|
// TODO include checksum
|
||||||
|
|
||||||
// Create pgdata subdirs structure
|
// Create pgdata subdirs structure
|
||||||
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
|
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
|
||||||
let header = new_tar_header_dir(*dir)?;
|
let header = new_tar_header_dir(*dir)?;
|
||||||
@@ -140,6 +157,13 @@ where
|
|||||||
// Create tablespace directories
|
// Create tablespace directories
|
||||||
for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? {
|
for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? {
|
||||||
self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
|
self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
|
||||||
|
|
||||||
|
// Gather and send relational files in each database if full backup is requested.
|
||||||
|
if self.full_backup {
|
||||||
|
for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? {
|
||||||
|
self.add_rel(rel)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for xid in self.timeline.list_twophase_files(self.lsn)? {
|
for xid in self.timeline.list_twophase_files(self.lsn)? {
|
||||||
self.add_twophase_file(xid)?;
|
self.add_twophase_file(xid)?;
|
||||||
@@ -157,6 +181,38 @@ where
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
|
||||||
|
let nblocks = self.timeline.get_rel_size(tag, self.lsn)?;
|
||||||
|
|
||||||
|
// Function that adds relation segment data to archive
|
||||||
|
let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> {
|
||||||
|
let file_name = tag.to_segfile_name(segment_index as u32);
|
||||||
|
let header = new_tar_header(&file_name, data.len() as u64)?;
|
||||||
|
self.ar.append(&header, data.as_slice())?;
|
||||||
|
Ok(())
|
||||||
|
};
|
||||||
|
|
||||||
|
// If the relation is empty, create an empty file
|
||||||
|
if nblocks == 0 {
|
||||||
|
add_file(0, &vec![])?;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add a file for each chunk of blocks (aka segment)
|
||||||
|
let chunks = (0..nblocks).chunks(pg_constants::RELSEG_SIZE as usize);
|
||||||
|
for (seg, blocks) in chunks.into_iter().enumerate() {
|
||||||
|
let mut segment_data: Vec<u8> = vec![];
|
||||||
|
for blknum in blocks {
|
||||||
|
let img = self.timeline.get_rel_page_at_lsn(tag, blknum, self.lsn)?;
|
||||||
|
segment_data.extend_from_slice(&img[..]);
|
||||||
|
}
|
||||||
|
|
||||||
|
add_file(seg, &segment_data)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// Generate SLRU segment files from repository.
|
// Generate SLRU segment files from repository.
|
||||||
//
|
//
|
||||||
|
|||||||
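
Aside: add_rel above splits a relation into segment files of RELSEG_SIZE blocks each, and an empty relation still gets one empty segment file. A small standalone sketch of that segmentation rule, assuming a RELSEG_SIZE of 131072 blocks (the stock Postgres value for 8 KB pages and 1 GB segments; the real constant comes from postgres_ffi::pg_constants):

const RELSEG_SIZE: u32 = 131_072; // blocks per segment, assumed stock build

// Return (segment index, block range) for every segment file a relation of
// `nblocks` pages produces under the rule used by add_rel.
fn segment_block_ranges(nblocks: u32) -> Vec<(u32, std::ops::Range<u32>)> {
    if nblocks == 0 {
        // Empty relation: a single empty segment file.
        return vec![(0, 0..0)];
    }
    (0..nblocks)
        .step_by(RELSEG_SIZE as usize)
        .enumerate()
        .map(|(seg, start)| {
            let end = (start + RELSEG_SIZE).min(nblocks);
            (seg as u32, start..end)
        })
        .collect()
}

For example, segment_block_ranges(131_073) yields two entries: segment 0 covering blocks 0..131072 and segment 1 covering block 131072..131073.
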
@@ -104,7 +104,7 @@ fn main() -> anyhow::Result<()> {
         return Ok(());
     }
 
-    let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
+    let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".neon"));
     let workdir = workdir
         .canonicalize()
         .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
@@ -263,6 +263,8 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
     // start profiler (if enabled)
    let profiler_guard = profiling::init_profiler(conf);
 
+    pageserver::tenant_tasks::init_tenant_task_pool()?;
+
     // initialize authentication for incoming connections
     let auth = match &conf.auth_type {
         AuthType::Trust | AuthType::MD5 => None,
@@ -22,6 +22,49 @@ paths:
|
|||||||
properties:
|
properties:
|
||||||
id:
|
id:
|
||||||
type: integer
|
type: integer
|
||||||
|
|
||||||
|
/v1/tenant/{tenant_id}:
|
||||||
|
parameters:
|
||||||
|
- name: tenant_id
|
||||||
|
in: path
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
format: hex
|
||||||
|
get:
|
||||||
|
description: Get tenant status
|
||||||
|
responses:
|
||||||
|
"200":
|
||||||
|
description: Currently returns the flag whether the tenant has inprogress timeline downloads
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/TenantInfo"
|
||||||
|
"400":
|
||||||
|
description: Error when no tenant id found in path or no timeline id
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/Error"
|
||||||
|
"401":
|
||||||
|
description: Unauthorized Error
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/UnauthorizedError"
|
||||||
|
"403":
|
||||||
|
description: Forbidden Error
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/ForbiddenError"
|
||||||
|
"500":
|
||||||
|
description: Generic operation error
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/Error"
|
||||||
|
|
||||||
/v1/tenant/{tenant_id}/timeline:
|
/v1/tenant/{tenant_id}/timeline:
|
||||||
parameters:
|
parameters:
|
||||||
- name: tenant_id
|
- name: tenant_id
|
||||||
@@ -70,6 +113,7 @@ paths:
|
|||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/Error"
|
$ref: "#/components/schemas/Error"
|
||||||
|
|
||||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
|
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
|
||||||
parameters:
|
parameters:
|
||||||
- name: tenant_id
|
- name: tenant_id
|
||||||
@@ -84,13 +128,14 @@ paths:
|
|||||||
schema:
|
schema:
|
||||||
type: string
|
type: string
|
||||||
format: hex
|
format: hex
|
||||||
- name: include-non-incremental-logical-size
|
|
||||||
in: query
|
|
||||||
schema:
|
|
||||||
type: string
|
|
||||||
description: Controls calculation of current_logical_size_non_incremental
|
|
||||||
get:
|
get:
|
||||||
description: Get info about the timeline
|
description: Get info about the timeline
|
||||||
|
parameters:
|
||||||
|
- name: include-non-incremental-logical-size
|
||||||
|
in: query
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
description: Controls calculation of current_logical_size_non_incremental
|
||||||
responses:
|
responses:
|
||||||
"200":
|
"200":
|
||||||
description: TimelineInfo
|
description: TimelineInfo
|
||||||
@@ -122,6 +167,35 @@ paths:
|
|||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/Error"
|
$ref: "#/components/schemas/Error"
|
||||||
|
delete:
|
||||||
|
description: "Attempts to delete specified timeline. On 500 errors should be retried"
|
||||||
|
responses:
|
||||||
|
"200":
|
||||||
|
description: Ok
|
||||||
|
"400":
|
||||||
|
description: Error when no tenant id found in path or no timeline id
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/Error"
|
||||||
|
"401":
|
||||||
|
description: Unauthorized Error
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/UnauthorizedError"
|
||||||
|
"403":
|
||||||
|
description: Forbidden Error
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/ForbiddenError"
|
||||||
|
"500":
|
||||||
|
description: Generic operation error
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/Error"
|
||||||
|
|
||||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver:
|
/v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver:
|
||||||
parameters:
|
parameters:
|
||||||
@@ -171,7 +245,7 @@ paths:
|
|||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/Error"
|
$ref: "#/components/schemas/Error"
|
||||||
|
|
||||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/attach:
|
/v1/tenant/{tenant_id}/attach:
|
||||||
parameters:
|
parameters:
|
||||||
- name: tenant_id
|
- name: tenant_id
|
||||||
in: path
|
in: path
|
||||||
@@ -179,19 +253,13 @@ paths:
|
|||||||
schema:
|
schema:
|
||||||
type: string
|
type: string
|
||||||
format: hex
|
format: hex
|
||||||
- name: timeline_id
|
|
||||||
in: path
|
|
||||||
required: true
|
|
||||||
schema:
|
|
||||||
type: string
|
|
||||||
format: hex
|
|
||||||
post:
|
post:
|
||||||
description: Attach remote timeline
|
description: Schedules attach operation to happen in the background for given tenant
|
||||||
responses:
|
responses:
|
||||||
"200":
|
"202":
|
||||||
description: Timeline attaching scheduled
|
description: Tenant attaching scheduled
|
||||||
"400":
|
"400":
|
||||||
description: Error when no tenant id found in path or no timeline id
|
description: Error when no tenant id found in path parameters
|
||||||
content:
|
content:
|
||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
@@ -215,7 +283,7 @@ paths:
|
|||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/NotFoundError"
|
$ref: "#/components/schemas/NotFoundError"
|
||||||
"409":
|
"409":
|
||||||
description: Timeline download is already in progress
|
description: Tenant download is already in progress
|
||||||
content:
|
content:
|
||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
@@ -227,7 +295,6 @@ paths:
|
|||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/Error"
|
$ref: "#/components/schemas/Error"
|
||||||
|
|
||||||
|
|
||||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach:
|
/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach:
|
||||||
parameters:
|
parameters:
|
||||||
- name: tenant_id
|
- name: tenant_id
|
||||||
@@ -243,10 +310,11 @@ paths:
|
|||||||
type: string
|
type: string
|
||||||
format: hex
|
format: hex
|
||||||
post:
|
post:
|
||||||
description: Detach local timeline
|
description: Deprecated, use DELETE /v1/tenant/{tenant_id}/timeline/{timeline_id} instead
|
||||||
|
deprecated: true
|
||||||
responses:
|
responses:
|
||||||
"200":
|
"200":
|
||||||
description: Timeline detached
|
description: Ok
|
||||||
"400":
|
"400":
|
||||||
description: Error when no tenant id found in path or no timeline id
|
description: Error when no tenant id found in path or no timeline id
|
||||||
content:
|
content:
|
||||||
@@ -272,6 +340,43 @@ paths:
|
|||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/Error"
|
$ref: "#/components/schemas/Error"
|
||||||
|
|
||||||
|
/v1/tenant/{tenant_id}/detach:
|
||||||
|
parameters:
|
||||||
|
- name: tenant_id
|
||||||
|
in: path
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
format: hex
|
||||||
|
post:
|
||||||
|
description: Detach local tenant
|
||||||
|
responses:
|
||||||
|
"200":
|
||||||
|
description: Tenant detached
|
||||||
|
"400":
|
||||||
|
description: Error when no tenant id found in path parameters
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/Error"
|
||||||
|
"401":
|
||||||
|
description: Unauthorized Error
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/UnauthorizedError"
|
||||||
|
"403":
|
||||||
|
description: Forbidden Error
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/ForbiddenError"
|
||||||
|
"500":
|
||||||
|
description: Generic operation error
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/Error"
|
||||||
|
|
||||||
/v1/tenant/{tenant_id}/timeline/:
|
/v1/tenant/{tenant_id}/timeline/:
|
||||||
parameters:
|
parameters:
|
||||||
@@ -467,12 +572,13 @@ components:
|
|||||||
type: object
|
type: object
|
||||||
required:
|
required:
|
||||||
- id
|
- id
|
||||||
- state
|
|
||||||
properties:
|
properties:
|
||||||
id:
|
id:
|
||||||
type: string
|
type: string
|
||||||
state:
|
state:
|
||||||
type: string
|
type: string
|
||||||
|
has_in_progress_downloads:
|
||||||
|
type: boolean
|
||||||
TenantCreateInfo:
|
TenantCreateInfo:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
@@ -567,6 +673,7 @@ components:
|
|||||||
type: integer
|
type: integer
|
||||||
current_logical_size_non_incremental:
|
current_logical_size_non_incremental:
|
||||||
type: integer
|
type: integer
|
||||||
|
|
||||||
WalReceiverEntry:
|
WalReceiverEntry:
|
||||||
type: object
|
type: object
|
||||||
required:
|
required:
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ use crate::repository::Repository;
|
|||||||
use crate::storage_sync;
|
use crate::storage_sync;
|
||||||
use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
|
use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
|
||||||
use crate::tenant_config::TenantConfOpt;
|
use crate::tenant_config::TenantConfOpt;
|
||||||
|
use crate::tenant_mgr::TenantInfo;
|
||||||
use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
|
use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
|
||||||
use crate::{config::PageServerConf, tenant_mgr, timelines};
|
use crate::{config::PageServerConf, tenant_mgr, timelines};
|
||||||
use utils::{
|
use utils::{
|
||||||
@@ -209,9 +210,9 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
|||||||
.await;
|
.await;
|
||||||
|
|
||||||
if local_timeline_info.is_none() && remote_timeline_info.is_none() {
|
if local_timeline_info.is_none() && remote_timeline_info.is_none() {
|
||||||
return Err(ApiError::NotFound(
|
return Err(ApiError::NotFound(format!(
|
||||||
"Timeline is not found neither locally nor remotely".to_string(),
|
"Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely"
|
||||||
));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
let timeline_info = TimelineInfo {
|
let timeline_info = TimelineInfo {
|
||||||
@@ -241,123 +242,157 @@ async fn wal_receiver_get_handler(request: Request<Body>) -> Result<Response<Bod
|
|||||||
json_response(StatusCode::OK, &wal_receiver_entry)
|
json_response(StatusCode::OK, &wal_receiver_entry)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
// TODO makes sense to provide tenant config right away the same way as it handled in tenant_create
|
||||||
|
async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
info!("Handling tenant attach {}", tenant_id,);
|
||||||
info!(
|
|
||||||
"Handling timeline {} attach for tenant: {}",
|
|
||||||
timeline_id, tenant_id,
|
|
||||||
);
|
|
||||||
|
|
||||||
tokio::task::spawn_blocking(move || {
|
tokio::task::spawn_blocking(move || {
|
||||||
if tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).is_ok() {
|
if tenant_mgr::get_tenant_state(tenant_id).is_some() {
|
||||||
// TODO: maybe answer with 309 Not Modified here?
|
anyhow::bail!("Tenant is already present locally")
|
||||||
anyhow::bail!("Timeline is already present locally")
|
|
||||||
};
|
};
|
||||||
Ok(())
|
Ok(())
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::from_err)??;
|
.map_err(ApiError::from_err)??;
|
||||||
|
|
||||||
let sync_id = ZTenantTimelineId {
|
|
||||||
tenant_id,
|
|
||||||
timeline_id,
|
|
||||||
};
|
|
||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
let remote_index = &state.remote_index;
|
let remote_index = &state.remote_index;
|
||||||
|
|
||||||
let mut index_accessor = remote_index.write().await;
|
let mut index_accessor = remote_index.write().await;
|
||||||
if let Some(remote_timeline) = index_accessor.timeline_entry_mut(&sync_id) {
|
if let Some(tenant_entry) = index_accessor.tenant_entry_mut(&tenant_id) {
|
||||||
if remote_timeline.awaits_download {
|
if tenant_entry.has_in_progress_downloads() {
|
||||||
return Err(ApiError::Conflict(
|
return Err(ApiError::Conflict(
|
||||||
"Timeline download is already in progress".to_string(),
|
"Tenant download is already in progress".to_string(),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
remote_timeline.awaits_download = true;
|
for (timeline_id, remote_timeline) in tenant_entry.iter_mut() {
|
||||||
storage_sync::schedule_layer_download(tenant_id, timeline_id);
|
storage_sync::schedule_layer_download(tenant_id, *timeline_id);
|
||||||
return json_response(StatusCode::ACCEPTED, ());
|
remote_timeline.awaits_download = true;
|
||||||
} else {
|
|
||||||
// no timeline in the index, release the lock to make the potentially lengthy download opetation
|
|
||||||
drop(index_accessor);
|
|
||||||
}
|
|
||||||
|
|
||||||
let new_timeline = match try_download_index_part_data(state, sync_id).await {
|
|
||||||
Ok(Some(mut new_timeline)) => {
|
|
||||||
tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
|
|
||||||
.await
|
|
||||||
-.context("Failed to create new timeline directory")?;
-new_timeline.awaits_download = true;
-new_timeline
 }
-Ok(None) => return Err(ApiError::NotFound("Unknown remote timeline".to_string())),
+return json_response(StatusCode::ACCEPTED, ());
+}
+// no tenant in the index, release the lock to make the potentially lengthy download opetation
+drop(index_accessor);

+// download index parts for every tenant timeline
+let remote_timelines = match gather_tenant_timelines_index_parts(state, tenant_id).await {
+Ok(Some(remote_timelines)) => remote_timelines,
+Ok(None) => return Err(ApiError::NotFound("Unknown remote tenant".to_string())),
 Err(e) => {
-error!("Failed to retrieve remote timeline data: {:?}", e);
+error!("Failed to retrieve remote tenant data: {:?}", e);
 return Err(ApiError::NotFound(
-"Failed to retrieve remote timeline".to_string(),
+"Failed to retrieve remote tenant".to_string(),
 ));
 }
 };

+// recheck that download is not in progress because
+// we've released the lock to avoid holding it during the download
 let mut index_accessor = remote_index.write().await;
-match index_accessor.timeline_entry_mut(&sync_id) {
+let tenant_entry = match index_accessor.tenant_entry_mut(&tenant_id) {
-Some(remote_timeline) => {
+Some(tenant_entry) => {
-if remote_timeline.awaits_download {
+if tenant_entry.has_in_progress_downloads() {
 return Err(ApiError::Conflict(
-"Timeline download is already in progress".to_string(),
+"Tenant download is already in progress".to_string(),
 ));
 }
-remote_timeline.awaits_download = true;
+tenant_entry
 }
-None => index_accessor.add_timeline_entry(sync_id, new_timeline),
+None => index_accessor.add_tenant_entry(tenant_id),
+};

+// populate remote index with the data from index part and create directories on the local filesystem
+for (timeline_id, mut remote_timeline) in remote_timelines {
+tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
+.await
+.context("Failed to create new timeline directory")?;

+remote_timeline.awaits_download = true;
+tenant_entry.insert(timeline_id, remote_timeline);
+// schedule actual download
+storage_sync::schedule_layer_download(tenant_id, timeline_id);
 }
-storage_sync::schedule_layer_download(tenant_id, timeline_id);
 json_response(StatusCode::ACCEPTED, ())
 }

-async fn try_download_index_part_data(
+/// Note: is expensive from s3 access perspective,
+/// for details see comment to `storage_sync::gather_tenant_timelines_index_parts`
+async fn gather_tenant_timelines_index_parts(
 state: &State,
-sync_id: ZTenantTimelineId,
+tenant_id: ZTenantId,
-) -> anyhow::Result<Option<RemoteTimeline>> {
+) -> anyhow::Result<Option<Vec<(ZTimelineId, RemoteTimeline)>>> {
-let index_part = match state.remote_storage.as_ref() {
+let index_parts = match state.remote_storage.as_ref() {
 Some(GenericRemoteStorage::Local(local_storage)) => {
-storage_sync::download_index_part(state.conf, local_storage, sync_id).await
+storage_sync::gather_tenant_timelines_index_parts(state.conf, local_storage, tenant_id)
+.await
 }
+// FIXME here s3 storage contains its own limits, that are separate from sync storage thread ones
+// because it is a different instance. We can move this limit to some global static
+// or use one instance everywhere.
 Some(GenericRemoteStorage::S3(s3_storage)) => {
-storage_sync::download_index_part(state.conf, s3_storage, sync_id).await
+storage_sync::gather_tenant_timelines_index_parts(state.conf, s3_storage, tenant_id)
+.await
 }
 None => return Ok(None),
 }
-.with_context(|| format!("Failed to download index part for timeline {sync_id}"))?;
+.with_context(|| format!("Failed to download index parts for tenant {tenant_id}"))?;

-let timeline_path = state
+let mut remote_timelines = Vec::with_capacity(index_parts.len());
-.conf
+for (timeline_id, index_part) in index_parts {
-.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
+let timeline_path = state.conf.timeline_path(&timeline_id, &tenant_id);
-RemoteTimeline::from_index_part(&timeline_path, index_part)
+let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part)
-.map(Some)
+.with_context(|| {
-.with_context(|| {
+format!("Failed to convert index part into remote timeline for timeline {tenant_id}/{timeline_id}")
-format!("Failed to convert index part into remote timeline for timeline {sync_id}")
+})?;
-})
+remote_timelines.push((timeline_id, remote_timeline));
+}
+Ok(Some(remote_timelines))
 }

-async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
 let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
 check_permission(&request, Some(tenant_id))?;

 let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;

+let state = get_state(&request);
 tokio::task::spawn_blocking(move || {
-let _enter =
+let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered();
-info_span!("timeline_detach_handler", tenant = %tenant_id, timeline = %timeline_id)
+tenant_mgr::delete_timeline(tenant_id, timeline_id)
-.entered();
-let state = get_state(&request);
-tenant_mgr::detach_timeline(state.conf, tenant_id, timeline_id)
 })
 .await
 .map_err(ApiError::from_err)??;

+let mut remote_index = state.remote_index.write().await;
+remote_index.remove_timeline_entry(ZTenantTimelineId {
+tenant_id,
+timeline_id,
+});

+json_response(StatusCode::OK, ())
+}

+async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
+check_permission(&request, Some(tenant_id))?;

+let state = get_state(&request);
+let conf = state.conf;
+tokio::task::spawn_blocking(move || {
+let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered();
+tenant_mgr::detach_tenant(conf, tenant_id)
+})
+.await
+.map_err(ApiError::from_err)??;

+let mut remote_index = state.remote_index.write().await;
+remote_index.remove_tenant_entry(&tenant_id);

 json_response(StatusCode::OK, ())
 }

@@ -365,9 +400,13 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
 // check for management permission
 check_permission(&request, None)?;

+let state = get_state(&request);
+// clone to avoid holding the lock while awaiting for blocking task
+let remote_index = state.remote_index.read().await.clone();

 let response_data = tokio::task::spawn_blocking(move || {
 let _enter = info_span!("tenant_list").entered();
-crate::tenant_mgr::list_tenants()
+crate::tenant_mgr::list_tenants(&remote_index)
 })
 .await
 .map_err(ApiError::from_err)?;

@@ -375,6 +414,34 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
 json_response(StatusCode::OK, response_data)
 }

+async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
+check_permission(&request, Some(tenant_id))?;

+// if tenant is in progress of downloading it can be absent in global tenant map
+let tenant_state = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant_state(tenant_id))
+.await
+.map_err(ApiError::from_err)?;

+let state = get_state(&request);
+let remote_index = &state.remote_index;

+let index_accessor = remote_index.read().await;
+let has_in_progress_downloads = index_accessor
+.tenant_entry(&tenant_id)
+.ok_or_else(|| ApiError::NotFound("Tenant not found in remote index".to_string()))?
+.has_in_progress_downloads();

+json_response(
+StatusCode::OK,
+TenantInfo {
+id: tenant_id,
+state: tenant_state,
+has_in_progress_downloads: Some(has_in_progress_downloads),
+},
+)
+}

 async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
 // check for management permission
 check_permission(&request, None)?;

@@ -520,24 +587,28 @@ pub fn make_router(
 .get("/v1/status", status_handler)
 .get("/v1/tenant", tenant_list_handler)
 .post("/v1/tenant", tenant_create_handler)
+.get("/v1/tenant/:tenant_id", tenant_status)
 .put("/v1/tenant/config", tenant_config_handler)
 .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
 .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
+.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler)
+.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler)
 .get(
 "/v1/tenant/:tenant_id/timeline/:timeline_id",
 timeline_detail_handler,
 )
+.delete(
+"/v1/tenant/:tenant_id/timeline/:timeline_id",
+timeline_delete_handler,
+)
+// for backward compatibility
+.post(
+"/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
+timeline_delete_handler,
+)
 .get(
 "/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver",
 wal_receiver_get_handler,
 )
-.post(
-"/v1/tenant/:tenant_id/timeline/:timeline_id/attach",
-timeline_attach_handler,
-)
-.post(
-"/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
-timeline_detach_handler,
-)
 .any(handler_404))
 }
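The routing changes above replace the per-timeline attach/detach endpoints with tenant-level ones and add a tenant status route. As a rough illustration of how a client could drive them, a sketch (not code from this compare; the base URL, port and any authorization handling are assumptions, and it requires the `reqwest` crate with the "blocking" feature):

```rust
// Sketch only: exercising the new tenant-level endpoints over HTTP.
fn attach_then_check(base_url: &str, tenant_id: &str) -> reqwest::Result<()> {
    let client = reqwest::blocking::Client::new();

    // POST /v1/tenant/:tenant_id/attach -> 202 Accepted while downloads are scheduled
    let resp = client
        .post(format!("{base_url}/v1/tenant/{tenant_id}/attach"))
        .send()?;
    println!("attach: {}", resp.status());

    // GET /v1/tenant/:tenant_id -> TenantInfo, including has_in_progress_downloads
    let resp = client
        .get(format!("{base_url}/v1/tenant/{tenant_id}"))
        .send()?;
    println!("status: {}", resp.text()?);

    Ok(())
}
```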
@@ -2,7 +2,6 @@
 //! Import data and WAL from a PostgreSQL data directory and WAL segments into
 //! a zenith Timeline.
 //!
-use std::fs;
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
 use std::path::{Path, PathBuf};

@@ -10,16 +9,18 @@ use std::path::{Path, PathBuf};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use tracing::*;
+use walkdir::WalkDir;

 use crate::pgdatadir_mapping::*;
 use crate::reltag::{RelTag, SlruKind};
 use crate::repository::Repository;
+use crate::repository::Timeline;
 use crate::walingest::WalIngest;
 use postgres_ffi::relfile_utils::*;
 use postgres_ffi::waldecoder::*;
 use postgres_ffi::xlog_utils::*;
+use postgres_ffi::Oid;
 use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED};
-use postgres_ffi::{Oid, TransactionId};
 use utils::lsn::Lsn;

 ///
@@ -35,100 +36,30 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
 ) -> Result<()> {
 let mut pg_control: Option<ControlFileData> = None;

+// TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn)
+// Then fishing out pg_control would be unnecessary
 let mut modification = tline.begin_modification(lsn);
 modification.init_empty()?;

-// Scan 'global'
+// Import all but pg_wal
-let mut relfiles: Vec<PathBuf> = Vec::new();
+let all_but_wal = WalkDir::new(path)
-for direntry in fs::read_dir(path.join("global"))? {
+.into_iter()
-let direntry = direntry?;
+.filter_entry(|entry| !entry.path().ends_with("pg_wal"));
-match direntry.file_name().to_str() {
+for entry in all_but_wal {
-None => continue,
+let entry = entry?;
+let metadata = entry.metadata().expect("error getting dir entry metadata");
+if metadata.is_file() {
+let absolute_path = entry.path();
+let relative_path = absolute_path.strip_prefix(path)?;

-Some("pg_control") => {
+let file = File::open(absolute_path)?;
-pg_control = Some(import_control_file(&mut modification, &direntry.path())?);
+let len = metadata.len() as usize;
+if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? {
+pg_control = Some(control_file);
 }
-Some("pg_filenode.map") => {
+modification.flush()?;
-import_relmap_file(
-&mut modification,
-pg_constants::GLOBALTABLESPACE_OID,
-0,
-&direntry.path(),
-)?;
-}

-// Load any relation files into the page server (but only after the other files)
-_ => relfiles.push(direntry.path()),
 }
 }
-for relfile in relfiles {
-import_relfile(
-&mut modification,
-&relfile,
-pg_constants::GLOBALTABLESPACE_OID,
-0,
-)?;
-}

-// Scan 'base'. It contains database dirs, the database OID is the filename.
-// E.g. 'base/12345', where 12345 is the database OID.
-for direntry in fs::read_dir(path.join("base"))? {
-let direntry = direntry?;

-//skip all temporary files
-if direntry.file_name().to_string_lossy() == "pgsql_tmp" {
-continue;
-}

-let dboid = direntry.file_name().to_string_lossy().parse::<u32>()?;

-let mut relfiles: Vec<PathBuf> = Vec::new();
-for direntry in fs::read_dir(direntry.path())? {
-let direntry = direntry?;
-match direntry.file_name().to_str() {
-None => continue,

-Some("PG_VERSION") => {
-//modification.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?;
-}
-Some("pg_filenode.map") => import_relmap_file(
-&mut modification,
-pg_constants::DEFAULTTABLESPACE_OID,
-dboid,
-&direntry.path(),
-)?,

-// Load any relation files into the page server
-_ => relfiles.push(direntry.path()),
-}
-}
-for relfile in relfiles {
-import_relfile(
-&mut modification,
-&relfile,
-pg_constants::DEFAULTTABLESPACE_OID,
-dboid,
-)?;
-}
-}
-for entry in fs::read_dir(path.join("pg_xact"))? {
-let entry = entry?;
-import_slru_file(&mut modification, SlruKind::Clog, &entry.path())?;
-}
-for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
-let entry = entry?;
-import_slru_file(&mut modification, SlruKind::MultiXactMembers, &entry.path())?;
-}
-for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
-let entry = entry?;
-import_slru_file(&mut modification, SlruKind::MultiXactOffsets, &entry.path())?;
-}
-for entry in fs::read_dir(path.join("pg_twophase"))? {
-let entry = entry?;
-let xid = u32::from_str_radix(&entry.path().to_string_lossy(), 16)?;
-import_twophase_file(&mut modification, xid, &entry.path())?;
-}
-// TODO: Scan pg_tblspc

 // We're done importing all the data files.
 modification.commit()?;
@@ -158,31 +89,30 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
 }

 // subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
-fn import_relfile<R: Repository>(
+fn import_rel<R: Repository, Reader: Read>(
 modification: &mut DatadirModification<R>,
 path: &Path,
 spcoid: Oid,
 dboid: Oid,
+mut reader: Reader,
+len: usize,
 ) -> anyhow::Result<()> {
 // Does it look like a relation file?
 trace!("importing rel file {}", path.display());

-let (relnode, forknum, segno) = parse_relfilename(&path.file_name().unwrap().to_string_lossy())
+let filename = &path
-.map_err(|e| {
+.file_name()
-warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
+.expect("missing rel filename")
-e
+.to_string_lossy();
-})?;
+let (relnode, forknum, segno) = parse_relfilename(filename).map_err(|e| {
+warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
+e
+})?;

-let mut file = File::open(path)?;
 let mut buf: [u8; 8192] = [0u8; 8192];

-let len = file.metadata().unwrap().len();
+ensure!(len % pg_constants::BLCKSZ as usize == 0);
-ensure!(len % pg_constants::BLCKSZ as u64 == 0);
+let nblocks = len / pg_constants::BLCKSZ as usize;
-let nblocks = len / pg_constants::BLCKSZ as u64;

-if segno != 0 {
-todo!();
-}

 let rel = RelTag {
 spcnode: spcoid,
@@ -190,11 +120,22 @@ fn import_relfile<R: Repository>(
 relnode,
 forknum,
 };
-modification.put_rel_creation(rel, nblocks as u32)?;

 let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);

+// Call put_rel_creation for every segment of the relation,
+// because there is no guarantee about the order in which we are processing segments.
+// ignore "relation already exists" error
+if let Err(e) = modification.put_rel_creation(rel, nblocks as u32) {
+if e.to_string().contains("already exists") {
+debug!("relation {} already exists. we must be extending it", rel);
+} else {
+return Err(e);
+}
+}

 loop {
-let r = file.read_exact(&mut buf);
+let r = reader.read_exact(&mut buf);
 match r {
 Ok(_) => {
 modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
@@ -204,7 +145,9 @@ fn import_relfile<R: Repository>(
 Err(err) => match err.kind() {
 std::io::ErrorKind::UnexpectedEof => {
 // reached EOF. That's expected.
-ensure!(blknum == nblocks as u32, "unexpected EOF");
+let relative_blknum =
+blknum - segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
+ensure!(relative_blknum == nblocks as u32, "unexpected EOF");
 break;
 }
 _ => {
@@ -215,96 +158,43 @@ fn import_relfile<R: Repository>(
 blknum += 1;
 }

+// Update relation size
+//
+// If we process rel segments out of order,
+// put_rel_extend will skip the update.
+modification.put_rel_extend(rel, blknum)?;

 Ok(())
 }

-/// Import a relmapper (pg_filenode.map) file into the repository
-fn import_relmap_file<R: Repository>(
-modification: &mut DatadirModification<R>,
-spcnode: Oid,
-dbnode: Oid,
-path: &Path,
-) -> Result<()> {
-let mut file = File::open(path)?;
-let mut buffer = Vec::new();
-// read the whole file
-file.read_to_end(&mut buffer)?;

-trace!("importing relmap file {}", path.display());

-modification.put_relmap_file(spcnode, dbnode, Bytes::copy_from_slice(&buffer[..]))?;
-Ok(())
-}

-/// Import a twophase state file (pg_twophase/<xid>) into the repository
-fn import_twophase_file<R: Repository>(
-modification: &mut DatadirModification<R>,
-xid: TransactionId,
-path: &Path,
-) -> Result<()> {
-let mut file = File::open(path)?;
-let mut buffer = Vec::new();
-// read the whole file
-file.read_to_end(&mut buffer)?;

-trace!("importing non-rel file {}", path.display());

-modification.put_twophase_file(xid, Bytes::copy_from_slice(&buffer[..]))?;
-Ok(())
-}

-///
-/// Import pg_control file into the repository.
-///
-/// The control file is imported as is, but we also extract the checkpoint record
-/// from it and store it separated.
-fn import_control_file<R: Repository>(
-modification: &mut DatadirModification<R>,
-path: &Path,
-) -> Result<ControlFileData> {
-let mut file = File::open(path)?;
-let mut buffer = Vec::new();
-// read the whole file
-file.read_to_end(&mut buffer)?;

-trace!("importing control file {}", path.display());

-// Import it as ControlFile
-modification.put_control_file(Bytes::copy_from_slice(&buffer[..]))?;

-// Extract the checkpoint record and import it separately.
-let pg_control = ControlFileData::decode(&buffer)?;
-let checkpoint_bytes = pg_control.checkPointCopy.encode()?;
-modification.put_checkpoint(checkpoint_bytes)?;

-Ok(pg_control)
-}

-///
 /// Import an SLRU segment file
 ///
-fn import_slru_file<R: Repository>(
+fn import_slru<R: Repository, Reader: Read>(
 modification: &mut DatadirModification<R>,
 slru: SlruKind,
 path: &Path,
+mut reader: Reader,
+len: usize,
 ) -> Result<()> {
 trace!("importing slru file {}", path.display());

-let mut file = File::open(path)?;
 let mut buf: [u8; 8192] = [0u8; 8192];
-let segno = u32::from_str_radix(&path.file_name().unwrap().to_string_lossy(), 16)?;
+let filename = &path
+.file_name()
+.expect("missing slru filename")
+.to_string_lossy();
+let segno = u32::from_str_radix(filename, 16)?;

-let len = file.metadata().unwrap().len();
+ensure!(len % pg_constants::BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ
-ensure!(len % pg_constants::BLCKSZ as u64 == 0); // we assume SLRU block size is the same as BLCKSZ
+let nblocks = len / pg_constants::BLCKSZ as usize;
-let nblocks = len / pg_constants::BLCKSZ as u64;

-ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as u64);
+ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize);

 modification.put_slru_segment_creation(slru, segno, nblocks as u32)?;

 let mut rpageno = 0;
 loop {
-let r = file.read_exact(&mut buf);
+let r = reader.read_exact(&mut buf);
 match r {
 Ok(_) => {
 modification.put_slru_page_image(
@@ -396,10 +286,272 @@ fn import_wal<R: Repository>(
 }

 if last_lsn != startpoint {
-debug!("reached end of WAL at {}", last_lsn);
+info!("reached end of WAL at {}", last_lsn);
 } else {
 info!("no WAL to import at {}", last_lsn);
 }

 Ok(())
 }

+pub fn import_basebackup_from_tar<R: Repository, Reader: Read>(
+tline: &mut DatadirTimeline<R>,
+reader: Reader,
+base_lsn: Lsn,
+) -> Result<()> {
+info!("importing base at {}", base_lsn);
+let mut modification = tline.begin_modification(base_lsn);
+modification.init_empty()?;

+let mut pg_control: Option<ControlFileData> = None;

+// Import base
+for base_tar_entry in tar::Archive::new(reader).entries()? {
+let entry = base_tar_entry?;
+let header = entry.header();
+let len = header.entry_size()? as usize;
+let file_path = header.path()?.into_owned();

+match header.entry_type() {
+tar::EntryType::Regular => {
+if let Some(res) = import_file(&mut modification, file_path.as_ref(), entry, len)? {
+// We found the pg_control file.
+pg_control = Some(res);
+}
+modification.flush()?;
+}
+tar::EntryType::Directory => {
+debug!("directory {:?}", file_path);
+}
+_ => {
+panic!("tar::EntryType::?? {}", file_path.display());
+}
+}
+}

+// sanity check: ensure that pg_control is loaded
+let _pg_control = pg_control.context("pg_control file not found")?;

+modification.commit()?;
+Ok(())
+}

+pub fn import_wal_from_tar<R: Repository, Reader: Read>(
+tline: &mut DatadirTimeline<R>,
+reader: Reader,
+start_lsn: Lsn,
+end_lsn: Lsn,
+) -> Result<()> {
+// Set up walingest mutable state
+let mut waldecoder = WalStreamDecoder::new(start_lsn);
+let mut segno = start_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
+let mut offset = start_lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
+let mut last_lsn = start_lsn;
+let mut walingest = WalIngest::new(tline, start_lsn)?;

+// Ingest wal until end_lsn
+info!("importing wal until {}", end_lsn);
+let mut pg_wal_tar = tar::Archive::new(reader);
+let mut pg_wal_entries_iter = pg_wal_tar.entries()?;
+while last_lsn <= end_lsn {
+let bytes = {
+let entry = pg_wal_entries_iter.next().expect("expected more wal")?;
+let header = entry.header();
+let file_path = header.path()?.into_owned();

+match header.entry_type() {
+tar::EntryType::Regular => {
+// FIXME: assume postgresql tli 1 for now
+let expected_filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
+let file_name = file_path
+.file_name()
+.expect("missing wal filename")
+.to_string_lossy();
+ensure!(expected_filename == file_name);

+debug!("processing wal file {:?}", file_path);
+read_all_bytes(entry)?
+}
+tar::EntryType::Directory => {
+debug!("directory {:?}", file_path);
+continue;
+}
+_ => {
+panic!("tar::EntryType::?? {}", file_path.display());
+}
+}
+};

+waldecoder.feed_bytes(&bytes[offset..]);

+while last_lsn <= end_lsn {
+if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
+walingest.ingest_record(tline, recdata, lsn)?;
+last_lsn = lsn;

+debug!("imported record at {} (end {})", lsn, end_lsn);
+}
+}

+debug!("imported records up to {}", last_lsn);
+segno += 1;
+offset = 0;
+}

+if last_lsn != start_lsn {
+info!("reached end of WAL at {}", last_lsn);
+} else {
+info!("there was no WAL to import at {}", last_lsn);
+}

+// Log any extra unused files
+for e in &mut pg_wal_entries_iter {
+let entry = e?;
+let header = entry.header();
+let file_path = header.path()?.into_owned();
+info!("skipping {:?}", file_path);
+}

+Ok(())
+}

+pub fn import_file<R: Repository, Reader: Read>(
+modification: &mut DatadirModification<R>,
+file_path: &Path,
+reader: Reader,
+len: usize,
+) -> Result<Option<ControlFileData>> {
+debug!("looking at {:?}", file_path);

+if file_path.starts_with("global") {
+let spcnode = pg_constants::GLOBALTABLESPACE_OID;
+let dbnode = 0;

+match file_path
+.file_name()
+.expect("missing filename")
+.to_string_lossy()
+.as_ref()
+{
+"pg_control" => {
+let bytes = read_all_bytes(reader)?;

+// Extract the checkpoint record and import it separately.
+let pg_control = ControlFileData::decode(&bytes[..])?;
+let checkpoint_bytes = pg_control.checkPointCopy.encode()?;
+modification.put_checkpoint(checkpoint_bytes)?;
+debug!("imported control file");

+// Import it as ControlFile
+modification.put_control_file(bytes)?;
+return Ok(Some(pg_control));
+}
+"pg_filenode.map" => {
+let bytes = read_all_bytes(reader)?;
+modification.put_relmap_file(spcnode, dbnode, bytes)?;
+debug!("imported relmap file")
+}
+"PG_VERSION" => {
+debug!("ignored");
+}
+_ => {
+import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
+debug!("imported rel creation");
+}
+}
+} else if file_path.starts_with("base") {
+let spcnode = pg_constants::DEFAULTTABLESPACE_OID;
+let dbnode: u32 = file_path
+.iter()
+.nth(1)
+.expect("invalid file path, expected dbnode")
+.to_string_lossy()
+.parse()?;

+match file_path
+.file_name()
+.expect("missing base filename")
+.to_string_lossy()
+.as_ref()
+{
+"pg_filenode.map" => {
+let bytes = read_all_bytes(reader)?;
+modification.put_relmap_file(spcnode, dbnode, bytes)?;
+debug!("imported relmap file")
+}
+"PG_VERSION" => {
+debug!("ignored");
+}
+_ => {
+import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
+debug!("imported rel creation");
+}
+}
+} else if file_path.starts_with("pg_xact") {
+let slru = SlruKind::Clog;

+import_slru(modification, slru, file_path, reader, len)?;
+debug!("imported clog slru");
+} else if file_path.starts_with("pg_multixact/offsets") {
+let slru = SlruKind::MultiXactOffsets;

+import_slru(modification, slru, file_path, reader, len)?;
+debug!("imported multixact offsets slru");
+} else if file_path.starts_with("pg_multixact/members") {
+let slru = SlruKind::MultiXactMembers;

+import_slru(modification, slru, file_path, reader, len)?;
+debug!("imported multixact members slru");
+} else if file_path.starts_with("pg_twophase") {
+let file_name = &file_path
+.file_name()
+.expect("missing twophase filename")
+.to_string_lossy();
+let xid = u32::from_str_radix(file_name, 16)?;

+let bytes = read_all_bytes(reader)?;
+modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?;
+debug!("imported twophase file");
+} else if file_path.starts_with("pg_wal") {
+debug!("found wal file in base section. ignore it");
+} else if file_path.starts_with("zenith.signal") {
+// Parse zenith signal file to set correct previous LSN
+let bytes = read_all_bytes(reader)?;
+// zenith.signal format is "PREV LSN: prev_lsn"
+// TODO write serialization and deserialization in the same place.
+let zenith_signal = std::str::from_utf8(&bytes)?.trim();
+let prev_lsn = match zenith_signal {
+"PREV LSN: none" => Lsn(0),
+"PREV LSN: invalid" => Lsn(0),
+other => {
+let split = other.split(':').collect::<Vec<_>>();
+split[1]
+.trim()
+.parse::<Lsn>()
+.context("can't parse zenith.signal")?
+}
+};

+// zenith.signal is not necessarily the last file, that we handle
+// but it is ok to call `finish_write()`, because final `modification.commit()`
+// will update lsn once more to the final one.
+let writer = modification.tline.tline.writer();
+writer.finish_write(prev_lsn);

+debug!("imported zenith signal {}", prev_lsn);
+} else if file_path.starts_with("pg_tblspc") {
+// TODO Backups exported from neon won't have pg_tblspc, but we will need
+// this to import arbitrary postgres databases.
+bail!("Importing pg_tblspc is not implemented");
+} else {
+debug!("ignored");
+}

+Ok(None)
+}

+fn read_all_bytes<Reader: Read>(mut reader: Reader) -> Result<Bytes> {
+let mut buf: Vec<u8> = vec![];
+reader.read_to_end(&mut buf)?;
+Ok(Bytes::copy_from_slice(&buf[..]))
+}
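The new `import_basebackup_from_tar` and `import_wal_from_tar` entry points both accept any `Read` stream. A minimal sketch of how a caller might drive them from tar files on disk (assuming it sits next to the code above so those items are in scope; the paths and LSNs are placeholders):

```rust
// Sketch only, not part of this compare.
use std::fs::File;
use std::path::Path;
use utils::lsn::Lsn;

fn restore_from_tars<R: Repository>(
    tline: &mut DatadirTimeline<R>,
    base_tar: &Path,
    wal_tar: &Path,
    base_lsn: Lsn,
    end_lsn: Lsn,
) -> anyhow::Result<()> {
    // Base backup: every tar entry is routed through import_file(), as above.
    import_basebackup_from_tar(tline, File::open(base_tar)?, base_lsn)?;
    // WAL: segments are decoded and ingested record by record up to end_lsn.
    import_wal_from_tar(tline, File::open(wal_tar)?, base_lsn, end_lsn)?;
    Ok(())
}
```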
@@ -4,7 +4,7 @@
 //! The functions here are responsible for locating the correct layer for the
 //! get/put call, tracing timeline branching history as needed.
 //!
-//! The files are stored in the .zenith/tenants/<tenantid>/timelines/<timelineid>
+//! The files are stored in the .neon/tenants/<tenantid>/timelines/<timelineid>
 //! directory. See layered_repository/README for how the files are managed.
 //! In addition to the layer files, there is a metadata file in the same
 //! directory that contains information about the timeline, in particular its
@@ -34,13 +34,11 @@ use std::time::{Duration, Instant, SystemTime};

 use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME};
 use crate::config::PageServerConf;
-use crate::keyspace::KeySpace;
+use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::storage_sync::index::RemoteIndex;
 use crate::tenant_config::{TenantConf, TenantConfOpt};

-use crate::repository::{
+use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline, TimelineWriter};
-GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, TimelineWriter,
-};
 use crate::repository::{Key, Value};
 use crate::tenant_mgr;
 use crate::thread_mgr;
@@ -148,7 +146,7 @@ lazy_static! {
 .expect("failed to define a metric");
 }

-/// Parts of the `.zenith/tenants/<tenantid>/timelines/<timelineid>` directory prefix.
+/// Parts of the `.neon/tenants/<tenantid>/timelines/<timelineid>` directory prefix.
 pub const TIMELINES_SEGMENT_NAME: &str = "timelines";

 ///
@@ -158,6 +156,18 @@ pub struct LayeredRepository {
 // Global pageserver config parameters
 pub conf: &'static PageServerConf,

+// Allows us to gracefully cancel operations that edit the directory
+// that backs this layered repository. Usage:
+//
+// Use `let _guard = file_lock.try_read()` while writing any files.
+// Use `let _guard = file_lock.write().unwrap()` to wait for all writes to finish.
+//
+// TODO try_read this lock during checkpoint as well to prevent race
+// between checkpoint and detach/delete.
+// TODO try_read this lock for all gc/compaction operations, not just
+// ones scheduled by the tenant task manager.
+pub file_lock: RwLock<()>,

 // Overridden tenant-specific config parameters.
 // We keep TenantConfOpt sturct here to preserve the information
 // about parameters that are not set.
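The comment above describes a read/write discipline for `file_lock` rather than an API. A minimal standalone sketch of that discipline (illustrative only, using `std::sync::RwLock` directly; not code from this compare):

```rust
use std::sync::RwLock;

struct Repo {
    file_lock: RwLock<()>,
}

impl Repo {
    fn write_some_file(&self) {
        // Shared guard: many concurrent file writers may hold it at once.
        if let Ok(_guard) = self.file_lock.try_read() {
            // ... create or modify files under the repository directory ...
        }
    }

    fn detach(&self) {
        // Exclusive guard: blocks until all in-flight file writes finish,
        // after which it is safe to remove the directory.
        let _guard = self.file_lock.write().unwrap();
        // ... remove files ...
    }
}
```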
@@ -220,43 +230,52 @@ impl Repository for LayeredRepository {

 fn create_empty_timeline(
 &self,
-timelineid: ZTimelineId,
+timeline_id: ZTimelineId,
 initdb_lsn: Lsn,
 ) -> Result<Arc<LayeredTimeline>> {
 let mut timelines = self.timelines.lock().unwrap();
+let vacant_timeline_entry = match timelines.entry(timeline_id) {
+Entry::Occupied(_) => bail!("Timeline already exists"),
+Entry::Vacant(vacant_entry) => vacant_entry,
+};

+let timeline_path = self.conf.timeline_path(&timeline_id, &self.tenant_id);
+if timeline_path.exists() {
+bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.")
+}

 // Create the timeline directory, and write initial metadata to file.
-crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenant_id))?;
+crashsafe_dir::create_dir_all(timeline_path)?;

 let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn);
-Self::save_metadata(self.conf, timelineid, self.tenant_id, &metadata, true)?;
+Self::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?;

 let timeline = LayeredTimeline::new(
 self.conf,
 Arc::clone(&self.tenant_conf),
 metadata,
 None,
-timelineid,
+timeline_id,
 self.tenant_id,
 Arc::clone(&self.walredo_mgr),
 self.upload_layers,
 );
 timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn);

+// Insert if not exists
 let timeline = Arc::new(timeline);
-let r = timelines.insert(
+vacant_timeline_entry.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline)));
-timelineid,
-LayeredTimelineEntry::Loaded(Arc::clone(&timeline)),
-);
-ensure!(
-r.is_none(),
-"assertion failure, inserted duplicate timeline"
-);
 Ok(timeline)
 }

 /// Branch a timeline
-fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()> {
+fn branch_timeline(
+&self,
+src: ZTimelineId,
+dst: ZTimelineId,
+start_lsn: Option<Lsn>,
+) -> Result<()> {
 // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn
 // about timelines, so otherwise a race condition is possible, where we create new timeline and GC
 // concurrently removes data that is needed by the new timeline.
@@ -269,6 +288,14 @@ impl Repository for LayeredRepository {
 .context("failed to load timeline for branching")?
 .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {}", &src))?;
 let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();

+// If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
+let start_lsn = start_lsn.unwrap_or_else(|| {
+let lsn = src_timeline.get_last_record_lsn();
+info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}");
+lsn
+});

 src_timeline
 .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
 .context("invalid branch start lsn")?;
@@ -315,19 +342,19 @@ impl Repository for LayeredRepository {
 /// metrics collection.
 fn gc_iteration(
 &self,
-target_timelineid: Option<ZTimelineId>,
+target_timeline_id: Option<ZTimelineId>,
 horizon: u64,
 pitr: Duration,
 checkpoint_before_gc: bool,
 ) -> Result<GcResult> {
-let timeline_str = target_timelineid
+let timeline_str = target_timeline_id
 .map(|x| x.to_string())
 .unwrap_or_else(|| "-".to_string());

 STORAGE_TIME
 .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
 .observe_closure_duration(|| {
-self.gc_iteration_internal(target_timelineid, horizon, pitr, checkpoint_before_gc)
+self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
 })
 }

@@ -394,50 +421,60 @@ impl Repository for LayeredRepository {
 Ok(())
 }

-fn detach_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> {
+fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> {
+// in order to be retriable detach needs to be idempotent
+// (or at least to a point that each time the detach is called it can make progress)
 let mut timelines = self.timelines.lock().unwrap();
-// check no child timelines, because detach will remove files, which will brake child branches
-// FIXME this can still be violated because we do not guarantee
+// Ensure that there are no child timelines **attached to that pageserver**,
-// that all ancestors are downloaded/attached to the same pageserver
+// because detach removes files, which will break child branches
-let num_children = timelines
+let children_exist = timelines
 .iter()
-.filter(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id))
+.any(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id));
-.count();

 ensure!(
-num_children == 0,
+!children_exist,
 "Cannot detach timeline which has child timelines"
 );
+let timeline_entry = match timelines.entry(timeline_id) {
+Entry::Occupied(e) => e,
+Entry::Vacant(_) => bail!("timeline not found"),
+};

+// try to acquire gc and compaction locks to prevent errors from missing files
+let _gc_guard = self
+.gc_cs
+.try_lock()
+.map_err(|e| anyhow::anyhow!("cannot acquire gc lock {e}"))?;

+let compaction_guard = timeline_entry.get().compaction_guard()?;

+let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
+std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
+format!(
+"Failed to remove local timeline directory '{}'",
+local_timeline_directory.display()
+)
+})?;
+info!("detach removed files");

+drop(compaction_guard);
+timeline_entry.remove();

-ensure!(
-timelines.remove(&timeline_id).is_some(),
-"Cannot detach timeline {timeline_id} that is not available locally"
-);
 Ok(())
 }

-fn apply_timeline_remote_sync_status_update(
+fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> {
-&self,
+debug!("attach timeline_id: {}", timeline_id,);
-timeline_id: ZTimelineId,
+match self.timelines.lock().unwrap().entry(timeline_id) {
-timeline_sync_status_update: TimelineSyncStatusUpdate,
+Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."),
-) -> Result<()> {
+Entry::Vacant(entry) => {
-debug!(
+// we need to get metadata of a timeline, another option is to pass it along with Downloaded status
-"apply_timeline_remote_sync_status_update timeline_id: {} update: {:?}",
+let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?;
-timeline_id, timeline_sync_status_update
+// finally we make newly downloaded timeline visible to repository
-);
+entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, })
-match timeline_sync_status_update {
+},
-TimelineSyncStatusUpdate::Downloaded => {
+};
-match self.timelines.lock().unwrap().entry(timeline_id) {
-Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."),
-Entry::Vacant(entry) => {
-// we need to get metadata of a timeline, another option is to pass it along with Downloaded status
-let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?;
-// finally we make newly downloaded timeline visible to repository
-entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, })
-},
-};
-}
-}
 Ok(())
 }

@@ -487,6 +524,18 @@ impl LayeredTimelineEntry {
 }
 }
 }

+fn compaction_guard(&self) -> Result<Option<MutexGuard<()>>, anyhow::Error> {
+match self {
+LayeredTimelineEntry::Loaded(timeline) => timeline
+.compaction_cs
+.try_lock()
+.map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}"))
+.map(Some),

+LayeredTimelineEntry::Unloaded { .. } => Ok(None),
+}
+}
 }

 impl From<LayeredTimelineEntry> for RepositoryTimeline<LayeredTimeline> {
@@ -685,6 +734,7 @@ impl LayeredRepository {
 ) -> LayeredRepository {
 LayeredRepository {
 tenant_id,
+file_lock: RwLock::new(()),
 conf,
 tenant_conf: Arc::new(RwLock::new(tenant_conf)),
 timelines: Mutex::new(HashMap::new()),
@@ -822,13 +872,13 @@ impl LayeredRepository {
 // we do.
 fn gc_iteration_internal(
 &self,
-target_timelineid: Option<ZTimelineId>,
+target_timeline_id: Option<ZTimelineId>,
 horizon: u64,
 pitr: Duration,
 checkpoint_before_gc: bool,
 ) -> Result<GcResult> {
 let _span_guard =
-info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timelineid)
+info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timeline_id)
 .entered();
 let mut totals: GcResult = Default::default();
 let now = Instant::now();
@@ -842,6 +892,12 @@ impl LayeredRepository {
 let mut timeline_ids = Vec::new();
 let mut timelines = self.timelines.lock().unwrap();

+if let Some(target_timeline_id) = target_timeline_id.as_ref() {
+if timelines.get(target_timeline_id).is_none() {
+bail!("gc target timeline does not exist")
+}
+};

 for (timeline_id, timeline_entry) in timelines.iter() {
 timeline_ids.push(*timeline_id);

@@ -850,7 +906,7 @@ impl LayeredRepository {
 // Somewhat related: https://github.com/zenithdb/zenith/issues/999
 if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() {
 // If target_timeline is specified, we only need to know branchpoints of its children
-if let Some(timelineid) = target_timelineid {
+if let Some(timelineid) = target_timeline_id {
 if ancestor_timeline_id == &timelineid {
 all_branchpoints
 .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn()));
@@ -865,7 +921,7 @@ impl LayeredRepository {

 // Ok, we now know all the branch points.
 // Perform GC for each timeline.
-for timelineid in timeline_ids.into_iter() {
+for timeline_id in timeline_ids.into_iter() {
 if thread_mgr::is_shutdown_requested() {
 // We were requested to shut down. Stop and return with the progress we
 // made.
@@ -874,12 +930,12 @@ impl LayeredRepository {

 // Timeline is known to be local and loaded.
 let timeline = self
-.get_timeline_load_internal(timelineid, &mut *timelines)?
+.get_timeline_load_internal(timeline_id, &mut *timelines)?
 .expect("checked above that timeline is local and loaded");

 // If target_timeline is specified, only GC it
-if let Some(target_timelineid) = target_timelineid {
+if let Some(target_timelineid) = target_timeline_id {
-if timelineid != target_timelineid {
+if timeline_id != target_timelineid {
 continue;
 }
 }
@@ -888,8 +944,8 @@ impl LayeredRepository {
 drop(timelines);
 let branchpoints: Vec<Lsn> = all_branchpoints
 .range((
-Included((timelineid, Lsn(0))),
+Included((timeline_id, Lsn(0))),
-Included((timelineid, Lsn(u64::MAX))),
+Included((timeline_id, Lsn(u64::MAX))),
 ))
 .map(|&x| x.1)
 .collect();
@@ -899,7 +955,7 @@ impl LayeredRepository {
 // used in tests, so we want as deterministic results as possible.
 if checkpoint_before_gc {
 timeline.checkpoint(CheckpointConfig::Forced)?;
-info!("timeline {} checkpoint_before_gc done", timelineid);
+info!("timeline {} checkpoint_before_gc done", timeline_id);
 }
 timeline.update_gc_info(branchpoints, cutoff, pitr);
 let result = timeline.gc()?;
@@ -1584,7 +1640,7 @@ impl LayeredTimeline {
 Ok(layer)
 }

-fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
+fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
 //info!("PUT: key {} at {}", key, lsn);
 let layer = self.get_layer_for_write(lsn)?;
 layer.put_value(key, lsn, val)?;
@@ -1712,24 +1768,29 @@ impl LayeredTimeline {

 /// Flush one frozen in-memory layer to disk, as a new delta layer.
 fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> Result<()> {
-let new_delta = frozen_layer.write_to_disk()?;
+let layer_paths_to_upload;
-let new_delta_path = new_delta.path();
+// As a special case, when we have just imported an image into the repository,
+// instead of writing out a L0 delta layer, we directly write out image layer
+// files instead. This is possible as long as *all* the data imported into the
+// repository have the same LSN.
+let lsn_range = frozen_layer.get_lsn_range();
+if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
+let pgdir = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)?;
+let (partitioning, _lsn) =
+pgdir.repartition(self.initdb_lsn, self.get_compaction_target_size())?;
+layer_paths_to_upload =
+self.create_image_layers(&partitioning, self.initdb_lsn, true)?;
+} else {
+// normal case, write out a L0 delta layer file.
+let delta_path = self.create_delta_layer(&frozen_layer)?;
+layer_paths_to_upload = HashSet::from([delta_path]);
+}

-// Sync the new layer to disk.
-//
-// We must also fsync the timeline dir to ensure the directory entries for
-// new layer files are durable
-//
-// TODO: If we're running inside 'flush_frozen_layers' and there are multiple
-// files to flush, it might be better to first write them all, and then fsync
-// them all in parallel.
-par_fsync::par_fsync(&[
-new_delta_path.clone(),
-self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
-])?;
 fail_point!("flush-frozen-before-sync");

-// Finally, replace the frozen in-memory layer with the new on-disk layer
+// The new on-disk layers are now in the layer map. We can remove the
+// in-memory layer from the map now.
 {
 let mut layers = self.layers.write().unwrap();
 let l = layers.frozen_layers.pop_front();
@@ -1739,19 +1800,27 @@ impl LayeredTimeline {
 // layer to disk at the same time, that would not work.
 assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer));

-// Add the new delta layer to the LayerMap
-layers.insert_historic(Arc::new(new_delta));

 // release lock on 'layers'
 }

+fail_point!("checkpoint-after-sync");

 // Update the metadata file, with new 'disk_consistent_lsn'
 //
 // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
 // *all* the layers, to avoid fsyncing the file multiple times.
-let disk_consistent_lsn = Lsn(frozen_layer.get_lsn_range().end.0 - 1);
+let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
-fail_point!("checkpoint-after-sync");
+self.update_disk_consistent_lsn(disk_consistent_lsn, layer_paths_to_upload)?;

+Ok(())
+}

+/// Update metadata file
+fn update_disk_consistent_lsn(
+&self,
+disk_consistent_lsn: Lsn,
+layer_paths_to_upload: HashSet<PathBuf>,
|
||||||
|
) -> Result<()> {
|
||||||
// If we were able to advance 'disk_consistent_lsn', save it the metadata file.
|
// If we were able to advance 'disk_consistent_lsn', save it the metadata file.
|
||||||
// After crash, we will restart WAL streaming and processing from that point.
|
// After crash, we will restart WAL streaming and processing from that point.
|
||||||
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
|
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
|
||||||
@@ -1801,14 +1870,11 @@ impl LayeredTimeline {
|
|||||||
false,
|
false,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
NUM_PERSISTENT_FILES_CREATED.inc_by(1);
|
|
||||||
PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len());
|
|
||||||
|
|
||||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
||||||
storage_sync::schedule_layer_upload(
|
storage_sync::schedule_layer_upload(
|
||||||
self.tenant_id,
|
self.tenant_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
HashSet::from([new_delta_path]),
|
layer_paths_to_upload,
|
||||||
Some(metadata),
|
Some(metadata),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -1820,6 +1886,37 @@ impl LayeredTimeline {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Write out the given frozen in-memory layer as a new L0 delta file
|
||||||
|
fn create_delta_layer(&self, frozen_layer: &InMemoryLayer) -> Result<PathBuf> {
|
||||||
|
// Write it out
|
||||||
|
let new_delta = frozen_layer.write_to_disk()?;
|
||||||
|
let new_delta_path = new_delta.path();
|
||||||
|
|
||||||
|
// Sync it to disk.
|
||||||
|
//
|
||||||
|
// We must also fsync the timeline dir to ensure the directory entries for
|
||||||
|
// new layer files are durable
|
||||||
|
//
|
||||||
|
// TODO: If we're running inside 'flush_frozen_layers' and there are multiple
|
||||||
|
// files to flush, it might be better to first write them all, and then fsync
|
||||||
|
// them all in parallel.
|
||||||
|
par_fsync::par_fsync(&[
|
||||||
|
new_delta_path.clone(),
|
||||||
|
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||||
|
])?;
|
||||||
|
|
||||||
|
// Add it to the layer map
|
||||||
|
{
|
||||||
|
let mut layers = self.layers.write().unwrap();
|
||||||
|
layers.insert_historic(Arc::new(new_delta));
|
||||||
|
}
|
||||||
|
|
||||||
|
NUM_PERSISTENT_FILES_CREATED.inc_by(1);
|
||||||
|
PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len());
|
||||||
|
|
||||||
|
Ok(new_delta_path)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn compact(&self) -> Result<()> {
|
pub fn compact(&self) -> Result<()> {
|
||||||
//
|
//
|
||||||
// High level strategy for compaction / image creation:
|
// High level strategy for compaction / image creation:
|
||||||
@@ -1863,29 +1960,23 @@ impl LayeredTimeline {
|
|||||||
if let Ok(pgdir) =
|
if let Ok(pgdir) =
|
||||||
tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)
|
tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)
|
||||||
{
|
{
|
||||||
|
// 2. Create new image layers for partitions that have been modified
|
||||||
|
// "enough".
|
||||||
let (partitioning, lsn) = pgdir.repartition(
|
let (partitioning, lsn) = pgdir.repartition(
|
||||||
self.get_last_record_lsn(),
|
self.get_last_record_lsn(),
|
||||||
self.get_compaction_target_size(),
|
self.get_compaction_target_size(),
|
||||||
)?;
|
)?;
|
||||||
let timer = self.create_images_time_histo.start_timer();
|
let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?;
|
||||||
// 2. Create new image layers for partitions that have been modified
|
if !layer_paths_to_upload.is_empty()
|
||||||
// "enough".
|
&& self.upload_layers.load(atomic::Ordering::Relaxed)
|
||||||
let mut layer_paths_to_upload = HashSet::with_capacity(partitioning.parts.len());
|
{
|
||||||
for part in partitioning.parts.iter() {
|
|
||||||
if self.time_for_new_image_layer(part, lsn)? {
|
|
||||||
let new_path = self.create_image_layer(part, lsn)?;
|
|
||||||
layer_paths_to_upload.insert(new_path);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
|
||||||
storage_sync::schedule_layer_upload(
|
storage_sync::schedule_layer_upload(
|
||||||
self.tenant_id,
|
self.tenant_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
layer_paths_to_upload,
|
HashSet::from_iter(layer_paths_to_upload),
|
||||||
None,
|
None,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
timer.stop_and_record();
|
|
||||||
|
|
||||||
// 3. Compact
|
// 3. Compact
|
||||||
let timer = self.compact_time_histo.start_timer();
|
let timer = self.compact_time_histo.start_timer();
|
||||||
@@ -1910,15 +2001,28 @@ impl LayeredTimeline {
|
|||||||
} else {
|
} else {
|
||||||
Lsn(0)
|
Lsn(0)
|
||||||
};
|
};
|
||||||
|
// Let's consider an example:
|
||||||
|
//
|
||||||
|
// delta layer with LSN range 71-81
|
||||||
|
// delta layer with LSN range 81-91
|
||||||
|
// delta layer with LSN range 91-101
|
||||||
|
// image layer at LSN 100
|
||||||
|
//
|
||||||
|
// If 'lsn' is still 100, i.e. no new WAL has been processed since the last image layer,
|
||||||
|
// there's no need to create a new one. We check this case explicitly, to avoid passing
|
||||||
|
// a bogus range to count_deltas below, with start > end. It's even possible that there
|
||||||
|
// are some delta layers *later* than current 'lsn', if more WAL was processed and flushed
|
||||||
|
// after we read last_record_lsn, which is passed here in the 'lsn' argument.
|
||||||
|
if img_lsn < lsn {
|
||||||
|
let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?;
|
||||||
|
|
||||||
let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?;
|
debug!(
|
||||||
|
"key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
|
||||||
debug!(
|
img_range.start, img_range.end, num_deltas, img_lsn, lsn
|
||||||
"range {}-{}, has {} deltas on this timeline",
|
);
|
||||||
img_range.start, img_range.end, num_deltas
|
if num_deltas >= self.get_image_creation_threshold() {
|
||||||
);
|
return Ok(true);
|
||||||
if num_deltas >= self.get_image_creation_threshold() {
|
}
|
||||||
return Ok(true);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1926,21 +2030,40 @@ impl LayeredTimeline {
|
|||||||
Ok(false)
|
Ok(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<PathBuf> {
|
fn create_image_layers(
|
||||||
let img_range =
|
&self,
|
||||||
partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
|
partitioning: &KeyPartitioning,
|
||||||
let mut image_layer_writer =
|
lsn: Lsn,
|
||||||
ImageLayerWriter::new(self.conf, self.timeline_id, self.tenant_id, &img_range, lsn)?;
|
force: bool,
|
||||||
|
) -> Result<HashSet<PathBuf>> {
|
||||||
|
let timer = self.create_images_time_histo.start_timer();
|
||||||
|
let mut image_layers: Vec<ImageLayer> = Vec::new();
|
||||||
|
let mut layer_paths_to_upload = HashSet::new();
|
||||||
|
for partition in partitioning.parts.iter() {
|
||||||
|
if force || self.time_for_new_image_layer(partition, lsn)? {
|
||||||
|
let img_range =
|
||||||
|
partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
|
||||||
|
let mut image_layer_writer = ImageLayerWriter::new(
|
||||||
|
self.conf,
|
||||||
|
self.timeline_id,
|
||||||
|
self.tenant_id,
|
||||||
|
&img_range,
|
||||||
|
lsn,
|
||||||
|
)?;
|
||||||
|
|
||||||
for range in &partition.ranges {
|
for range in &partition.ranges {
|
||||||
let mut key = range.start;
|
let mut key = range.start;
|
||||||
while key < range.end {
|
while key < range.end {
|
||||||
let img = self.get(key, lsn)?;
|
let img = self.get(key, lsn)?;
|
||||||
image_layer_writer.put_image(key, &img)?;
|
image_layer_writer.put_image(key, &img)?;
|
||||||
key = key.next();
|
key = key.next();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let image_layer = image_layer_writer.finish()?;
|
||||||
|
layer_paths_to_upload.insert(image_layer.path());
|
||||||
|
image_layers.push(image_layer);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let image_layer = image_layer_writer.finish()?;
|
|
||||||
|
|
||||||
// Sync the new layer to disk before adding it to the layer map, to make sure
|
// Sync the new layer to disk before adding it to the layer map, to make sure
|
||||||
// we don't garbage collect something based on the new layer, before it has
|
// we don't garbage collect something based on the new layer, before it has
|
||||||
@@ -1951,19 +2074,18 @@ impl LayeredTimeline {
|
|||||||
//
|
//
|
||||||
// Compaction creates multiple image layers. It would be better to create them all
|
// Compaction creates multiple image layers. It would be better to create them all
|
||||||
// and fsync them all in parallel.
|
// and fsync them all in parallel.
|
||||||
par_fsync::par_fsync(&[
|
let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone());
|
||||||
image_layer.path(),
|
all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));
|
||||||
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
par_fsync::par_fsync(&all_paths)?;
|
||||||
])?;
|
|
||||||
|
|
||||||
// FIXME: Do we need to do something to upload it to remote storage here?
|
|
||||||
|
|
||||||
let mut layers = self.layers.write().unwrap();
|
let mut layers = self.layers.write().unwrap();
|
||||||
let new_path = image_layer.path();
|
for l in image_layers {
|
||||||
layers.insert_historic(Arc::new(image_layer));
|
layers.insert_historic(Arc::new(l));
|
||||||
|
}
|
||||||
drop(layers);
|
drop(layers);
|
||||||
|
timer.stop_and_record();
|
||||||
|
|
||||||
Ok(new_path)
|
Ok(layer_paths_to_upload)
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -2210,6 +2332,9 @@ impl LayeredTimeline {
|
|||||||
LsnForTimestamp::Past(lsn) => {
|
LsnForTimestamp::Past(lsn) => {
|
||||||
debug!("past({})", lsn);
|
debug!("past({})", lsn);
|
||||||
}
|
}
|
||||||
|
LsnForTimestamp::NoData(lsn) => {
|
||||||
|
debug!("nodata({})", lsn);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn)
|
debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn)
|
||||||
}
|
}
|
||||||
@@ -2483,7 +2608,7 @@ impl Deref for LayeredTimelineWriter<'_> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> {
|
impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> {
|
||||||
fn put(&self, key: Key, lsn: Lsn, value: Value) -> Result<()> {
|
fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> {
|
||||||
self.tl.put_value(key, lsn, value)
|
self.tl.put_value(key, lsn, value)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2625,7 +2750,7 @@ pub mod tests {
|
|||||||
let TEST_KEY: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
|
let TEST_KEY: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
|
||||||
|
|
||||||
let writer = tline.writer();
|
let writer = tline.writer();
|
||||||
writer.put(TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?;
|
writer.put(TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
|
||||||
writer.finish_write(Lsn(0x10));
|
writer.finish_write(Lsn(0x10));
|
||||||
drop(writer);
|
drop(writer);
|
||||||
|
|
||||||
@@ -2633,7 +2758,7 @@ pub mod tests {
|
|||||||
tline.compact()?;
|
tline.compact()?;
|
||||||
|
|
||||||
let writer = tline.writer();
|
let writer = tline.writer();
|
||||||
writer.put(TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?;
|
writer.put(TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
|
||||||
writer.finish_write(Lsn(0x20));
|
writer.finish_write(Lsn(0x20));
|
||||||
drop(writer);
|
drop(writer);
|
||||||
|
|
||||||
@@ -2641,7 +2766,7 @@ pub mod tests {
|
|||||||
tline.compact()?;
|
tline.compact()?;
|
||||||
|
|
||||||
let writer = tline.writer();
|
let writer = tline.writer();
|
||||||
writer.put(TEST_KEY, Lsn(0x30), Value::Image(TEST_IMG("foo at 0x30")))?;
|
writer.put(TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?;
|
||||||
writer.finish_write(Lsn(0x30));
|
writer.finish_write(Lsn(0x30));
|
||||||
drop(writer);
|
drop(writer);
|
||||||
|
|
||||||
@@ -2649,7 +2774,7 @@ pub mod tests {
|
|||||||
tline.compact()?;
|
tline.compact()?;
|
||||||
|
|
||||||
let writer = tline.writer();
|
let writer = tline.writer();
|
||||||
writer.put(TEST_KEY, Lsn(0x40), Value::Image(TEST_IMG("foo at 0x40")))?;
|
writer.put(TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?;
|
||||||
writer.finish_write(Lsn(0x40));
|
writer.finish_write(Lsn(0x40));
|
||||||
drop(writer);
|
drop(writer);
|
||||||
|
|
||||||
@@ -2687,7 +2812,7 @@ pub mod tests {
|
|||||||
writer.put(
|
writer.put(
|
||||||
test_key,
|
test_key,
|
||||||
lsn,
|
lsn,
|
||||||
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||||
)?;
|
)?;
|
||||||
writer.finish_write(lsn);
|
writer.finish_write(lsn);
|
||||||
drop(writer);
|
drop(writer);
|
||||||
@@ -2733,7 +2858,7 @@ pub mod tests {
|
|||||||
writer.put(
|
writer.put(
|
||||||
test_key,
|
test_key,
|
||||||
lsn,
|
lsn,
|
||||||
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||||
)?;
|
)?;
|
||||||
writer.finish_write(lsn);
|
writer.finish_write(lsn);
|
||||||
updated[blknum] = lsn;
|
updated[blknum] = lsn;
|
||||||
@@ -2751,7 +2876,7 @@ pub mod tests {
|
|||||||
writer.put(
|
writer.put(
|
||||||
test_key,
|
test_key,
|
||||||
lsn,
|
lsn,
|
||||||
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||||
)?;
|
)?;
|
||||||
writer.finish_write(lsn);
|
writer.finish_write(lsn);
|
||||||
drop(writer);
|
drop(writer);
|
||||||
@@ -2803,7 +2928,7 @@ pub mod tests {
|
|||||||
writer.put(
|
writer.put(
|
||||||
test_key,
|
test_key,
|
||||||
lsn,
|
lsn,
|
||||||
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||||
)?;
|
)?;
|
||||||
writer.finish_write(lsn);
|
writer.finish_write(lsn);
|
||||||
updated[blknum] = lsn;
|
updated[blknum] = lsn;
|
||||||
@@ -2815,7 +2940,7 @@ pub mod tests {
|
|||||||
let mut tline_id = TIMELINE_ID;
|
let mut tline_id = TIMELINE_ID;
|
||||||
for _ in 0..50 {
|
for _ in 0..50 {
|
||||||
let new_tline_id = ZTimelineId::generate();
|
let new_tline_id = ZTimelineId::generate();
|
||||||
repo.branch_timeline(tline_id, new_tline_id, lsn)?;
|
repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?;
|
||||||
tline = repo.get_timeline_load(new_tline_id)?;
|
tline = repo.get_timeline_load(new_tline_id)?;
|
||||||
tline_id = new_tline_id;
|
tline_id = new_tline_id;
|
||||||
|
|
||||||
@@ -2827,7 +2952,7 @@ pub mod tests {
|
|||||||
writer.put(
|
writer.put(
|
||||||
test_key,
|
test_key,
|
||||||
lsn,
|
lsn,
|
||||||
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||||
)?;
|
)?;
|
||||||
println!("updating {} at {}", blknum, lsn);
|
println!("updating {} at {}", blknum, lsn);
|
||||||
writer.finish_write(lsn);
|
writer.finish_write(lsn);
|
||||||
@@ -2874,7 +2999,7 @@ pub mod tests {
|
|||||||
#[allow(clippy::needless_range_loop)]
|
#[allow(clippy::needless_range_loop)]
|
||||||
for idx in 0..NUM_TLINES {
|
for idx in 0..NUM_TLINES {
|
||||||
let new_tline_id = ZTimelineId::generate();
|
let new_tline_id = ZTimelineId::generate();
|
||||||
repo.branch_timeline(tline_id, new_tline_id, lsn)?;
|
repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?;
|
||||||
tline = repo.get_timeline_load(new_tline_id)?;
|
tline = repo.get_timeline_load(new_tline_id)?;
|
||||||
tline_id = new_tline_id;
|
tline_id = new_tline_id;
|
||||||
|
|
||||||
@@ -2886,7 +3011,7 @@ pub mod tests {
|
|||||||
writer.put(
|
writer.put(
|
||||||
test_key,
|
test_key,
|
||||||
lsn,
|
lsn,
|
||||||
Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))),
|
&Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))),
|
||||||
)?;
|
)?;
|
||||||
println!("updating [{}][{}] at {}", idx, blknum, lsn);
|
println!("updating [{}][{}] at {}", idx, blknum, lsn);
|
||||||
writer.finish_write(lsn);
|
writer.finish_write(lsn);
|
||||||
|
|||||||
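The hunks above replace the single-partition `create_image_layer` with a batched `create_image_layers`, gated per partition by `time_for_new_image_layer` (or forced during import). A minimal sketch of that gating rule, using plain integers for LSNs and a hypothetical free function rather than the pageserver's own types:

```rust
/// Decide whether a key-range partition deserves a new image layer.
/// `img_lsn` is the LSN of the newest image covering the range, `last_lsn`
/// is the timeline's last record LSN, and `num_deltas` is how many delta
/// layers overlap the range since `img_lsn`. Hypothetical standalone
/// function; the real check lives on LayeredTimeline.
fn should_create_image(img_lsn: u64, last_lsn: u64, num_deltas: usize, threshold: usize) -> bool {
    // If no WAL has been processed since the last image layer, there is
    // nothing new to materialize (and img_lsn..last_lsn would be empty).
    if img_lsn >= last_lsn {
        return false;
    }
    // Otherwise create an image once enough deltas have piled up.
    num_deltas >= threshold
}

fn main() {
    // Mirrors the example in the comment: deltas up to LSN 101, image at 100.
    assert!(!should_create_image(100, 100, 0, 3)); // no new WAL yet
    assert!(should_create_image(100, 120, 3, 3)); // three deltas since the image
}
```

The early return corresponds to the "image layer at LSN 100, lsn still 100" case spelled out in the new comment block.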
@@ -123,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and
 a range of LSNs (or a single LSN, in case of image layers). You can think of it
 as a rectangle in the two-dimensional key-LSN space. The layer files for each
 timeline are stored in the timeline's subdirectory under
-`.zenith/tenants/<tenantid>/timelines`.
+`.neon/tenants/<tenantid>/timelines`.
 
 There are two kind of layer files: images, and delta layers. An image file
 contains a snapshot of all keys at a particular LSN, whereas a delta file
@@ -178,7 +178,7 @@ version, and how branching and GC works is still valid.
 The full path of a delta file looks like this:
 
 ```
-.zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
+.neon/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
 ```
 
 For simplicity, the examples below use a simplified notation for the
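The documentation hunk above describes each layer file as a rectangle in two-dimensional key–LSN space. A hedged illustration of that mental model, with made-up integer key and LSN types rather than the pageserver's `Key`/`Lsn`:

```rust
use std::ops::Range;

/// Illustrative model of a layer file's coverage: a rectangle in
/// (key, LSN) space. Image layers have an LSN range of length one.
struct LayerRect {
    key_range: Range<u128>,
    lsn_range: Range<u64>,
}

impl LayerRect {
    /// Does this layer potentially hold data for `key` at `lsn`?
    fn covers(&self, key: u128, lsn: u64) -> bool {
        self.key_range.contains(&key) && self.lsn_range.contains(&lsn)
    }
}

fn main() {
    // LSN bounds borrowed from the example file name above.
    let delta = LayerRect { key_range: 0..1000, lsn_range: 0x169C348..0x1702000 };
    assert!(delta.covers(42, 0x1700000));
    assert!(!delta.covers(42, 0x1800000));
}
```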
@@ -34,7 +34,7 @@ pub trait BlobCursor {
 ) -> Result<(), std::io::Error>;
 }
 
-impl<'a, R> BlobCursor for BlockCursor<R>
+impl<R> BlobCursor for BlockCursor<R>
 where
 R: BlockReader,
 {
@@ -445,7 +445,10 @@ impl ImageLayerWriter {
 },
 );
 info!("new image layer {}", path.display());
-let mut file = VirtualFile::create(&path)?;
+let mut file = VirtualFile::open_with_options(
+&path,
+std::fs::OpenOptions::new().write(true).create_new(true),
+)?;
 // make room for the header block
 file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
 let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64);
@@ -267,13 +267,13 @@ impl InMemoryLayer {
 
 /// Common subroutine of the public put_wal_record() and put_page_image() functions.
 /// Adds the page version to the in-memory tree
-pub fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
+pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
 trace!("put_value key {} at {}/{}", key, self.timelineid, lsn);
 let mut inner = self.inner.write().unwrap();
 
 inner.assert_writeable();
 
-let off = inner.file.write_blob(&Value::ser(&val)?)?;
+let off = inner.file.write_blob(&Value::ser(val)?)?;
 
 let vec_map = inner.index.entry(key).or_default();
 let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
@@ -13,7 +13,7 @@ pub mod repository;
 pub mod storage_sync;
 pub mod tenant_config;
 pub mod tenant_mgr;
-pub mod tenant_threads;
+pub mod tenant_tasks;
 pub mod thread_mgr;
 pub mod timelines;
 pub mod virtual_file;
@@ -24,7 +24,6 @@ pub mod walredo;
 
 use lazy_static::lazy_static;
 use tracing::info;
-use utils::postgres_backend;
 
 use crate::thread_mgr::ThreadKind;
 use metrics::{register_int_gauge_vec, IntGaugeVec};
@@ -73,7 +72,6 @@ pub fn shutdown_pageserver(exit_code: i32) {
 thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None);
 
 // Shut down any page service threads.
-postgres_backend::set_pgbackend_shutdown_requested();
 thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None);
 
 // Shut down all the tenants. This flushes everything to disk and kills
@@ -13,7 +13,7 @@ use anyhow::{bail, ensure, Context, Result};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use lazy_static::lazy_static;
 use regex::Regex;
-use std::io;
+use std::io::{self, Read};
 use std::net::TcpListener;
 use std::str;
 use std::str::FromStr;
@@ -29,6 +29,8 @@ use utils::{
 
 use crate::basebackup;
 use crate::config::{PageServerConf, ProfilingConfig};
+use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar};
+use crate::layered_repository::LayeredRepository;
 use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp};
 use crate::profiling::profpoint_start;
 use crate::reltag::RelTag;
@@ -200,6 +202,96 @@ impl PagestreamBeMessage {
 }
 }
 
+/// Implements Read for the server side of CopyIn
+struct CopyInReader<'a> {
+pgb: &'a mut PostgresBackend,
+
+/// Overflow buffer for bytes sent in CopyData messages
+/// that the reader (caller of read) hasn't asked for yet.
+/// TODO use BytesMut?
+buf: Vec<u8>,
+
+/// Bytes before `buf_begin` are considered as dropped.
+/// This allows us to implement O(1) pop_front on Vec<u8>.
+/// The Vec won't grow large because we only add to it
+/// when it's empty.
+buf_begin: usize,
+}
+
+impl<'a> CopyInReader<'a> {
+// NOTE: pgb should be in copy in state already
+fn new(pgb: &'a mut PostgresBackend) -> Self {
+Self {
+pgb,
+buf: Vec::<_>::new(),
+buf_begin: 0,
+}
+}
+}
+
+impl<'a> Drop for CopyInReader<'a> {
+fn drop(&mut self) {
+// Finalize copy protocol so that self.pgb can be reused
+// TODO instead, maybe take ownership of pgb and give it back at the end
+let mut buf: Vec<u8> = vec![];
+let _ = self.read_to_end(&mut buf);
+}
+}
+
+impl<'a> Read for CopyInReader<'a> {
+fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+while !thread_mgr::is_shutdown_requested() {
+// Return from buffer if nonempty
+if self.buf_begin < self.buf.len() {
+let bytes_to_read = std::cmp::min(buf.len(), self.buf.len() - self.buf_begin);
+buf[..bytes_to_read].copy_from_slice(&self.buf[self.buf_begin..][..bytes_to_read]);
+self.buf_begin += bytes_to_read;
+return Ok(bytes_to_read);
+}
+
+// Delete garbage
+self.buf.clear();
+self.buf_begin = 0;
+
+// Wait for client to send CopyData bytes
+match self.pgb.read_message() {
+Ok(Some(message)) => {
+let copy_data_bytes = match message {
+FeMessage::CopyData(bytes) => bytes,
+FeMessage::CopyDone => return Ok(0),
+FeMessage::Sync => continue,
+m => {
+let msg = format!("unexpected message {:?}", m);
+self.pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
+return Err(io::Error::new(io::ErrorKind::Other, msg));
+}
+};
+
+// Return as much as we can, saving the rest in self.buf
+let mut reader = copy_data_bytes.reader();
+let bytes_read = reader.read(buf)?;
+reader.read_to_end(&mut self.buf)?;
+return Ok(bytes_read);
+}
+Ok(None) => {
+let msg = "client closed connection";
+self.pgb.write_message(&BeMessage::ErrorResponse(msg))?;
+return Err(io::Error::new(io::ErrorKind::Other, msg));
+}
+Err(e) => {
+if !is_socket_read_timed_out(&e) {
+return Err(io::Error::new(io::ErrorKind::Other, e));
+}
+}
+}
+}
+
+// Shutting down
+let msg = "Importer thread was shut down";
+Err(io::Error::new(io::ErrorKind::Other, msg))
+}
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 
 ///
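`CopyInReader` adapts the CopyData message stream into a `std::io::Read`, keeping any bytes the caller did not consume in an overflow buffer. The sketch below reproduces the same buffering scheme over an in-memory chunk list, with no `PostgresBackend` involved, so it compiles and runs on its own:

```rust
use std::io::{self, Read};

/// Toy reader that serves bytes chunk by chunk, keeping leftover bytes in an
/// overflow buffer, the same scheme CopyInReader uses for CopyData payloads.
struct ChunkReader {
    /// Pretend these are incoming CopyData payloads (stored in reverse order
    /// so that pop() yields them front to back).
    chunks: Vec<Vec<u8>>,
    buf: Vec<u8>,
    buf_begin: usize,
}

impl Read for ChunkReader {
    fn read(&mut self, out: &mut [u8]) -> io::Result<usize> {
        loop {
            // Serve from the overflow buffer first.
            if self.buf_begin < self.buf.len() {
                let n = out.len().min(self.buf.len() - self.buf_begin);
                out[..n].copy_from_slice(&self.buf[self.buf_begin..][..n]);
                self.buf_begin += n;
                return Ok(n);
            }
            self.buf.clear();
            self.buf_begin = 0;
            match self.chunks.pop() {
                Some(chunk) => self.buf = chunk, // next "CopyData" payload
                None => return Ok(0),            // "CopyDone": end of stream
            }
        }
    }
}

fn main() -> io::Result<()> {
    let mut r = ChunkReader {
        chunks: vec![b"world".to_vec(), b"hello ".to_vec()],
        buf: Vec::new(),
        buf_begin: 0,
    };
    let mut s = String::new();
    r.read_to_string(&mut s)?;
    assert_eq!(s, "hello world");
    Ok(())
}
```

Because the adapter is plain `Read`, `import_basebackup_from_tar` and `import_wal_from_tar` can consume it like any other byte stream.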
@@ -370,6 +462,10 @@ impl PageServerHandler {
 ) -> anyhow::Result<()> {
 let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();
 
+// NOTE: pagerequests handler exits when connection is closed,
+// so there is no need to reset the association
+thread_mgr::associate_with(Some(tenantid), Some(timelineid));
+
 // Check that the timeline exists
 let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
 .context("Cannot load local timeline")?;
@@ -443,6 +539,98 @@ impl PageServerHandler {
 Ok(())
 }
 
+fn handle_import_basebackup(
+&self,
+pgb: &mut PostgresBackend,
+tenant_id: ZTenantId,
+timeline_id: ZTimelineId,
+base_lsn: Lsn,
+_end_lsn: Lsn,
+) -> anyhow::Result<()> {
+thread_mgr::associate_with(Some(tenant_id), Some(timeline_id));
+let _enter =
+info_span!("import basebackup", timeline = %timeline_id, tenant = %tenant_id).entered();
+
+// Create empty timeline
+info!("creating new timeline");
+let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
+let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?;
+let repartition_distance = repo.get_checkpoint_distance();
+let mut datadir_timeline =
+DatadirTimeline::<LayeredRepository>::new(timeline, repartition_distance);
+
+// TODO mark timeline as not ready until it reaches end_lsn.
+// We might have some wal to import as well, and we should prevent compute
+// from connecting before that and writing conflicting wal.
+//
+// This is not relevant for pageserver->pageserver migrations, since there's
+// no wal to import. But should be fixed if we want to import from postgres.
+
+// TODO leave clean state on error. For now you can use detach to clean
+// up broken state from a failed import.
+
+// Import basebackup provided via CopyData
+info!("importing basebackup");
+pgb.write_message(&BeMessage::CopyInResponse)?;
+let reader = CopyInReader::new(pgb);
+import_basebackup_from_tar(&mut datadir_timeline, reader, base_lsn)?;
+
+// TODO check checksum
+// Meanwhile you can verify client-side by taking fullbackup
+// and checking that it matches in size with what was imported.
+// It wouldn't work if base came from vanilla postgres though,
+// since we discard some log files.
+
+// Flush data to disk, then upload to s3
+info!("flushing layers");
+datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?;
+
+info!("done");
+Ok(())
+}
+
+fn handle_import_wal(
+&self,
+pgb: &mut PostgresBackend,
+tenant_id: ZTenantId,
+timeline_id: ZTimelineId,
+start_lsn: Lsn,
+end_lsn: Lsn,
+) -> anyhow::Result<()> {
+thread_mgr::associate_with(Some(tenant_id), Some(timeline_id));
+let _enter =
+info_span!("import wal", timeline = %timeline_id, tenant = %tenant_id).entered();
+
+let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
+let timeline = repo.get_timeline_load(timeline_id)?;
+ensure!(timeline.get_last_record_lsn() == start_lsn);
+
+let repartition_distance = repo.get_checkpoint_distance();
+let mut datadir_timeline =
+DatadirTimeline::<LayeredRepository>::new(timeline, repartition_distance);
+
+// TODO leave clean state on error. For now you can use detach to clean
+// up broken state from a failed import.
+
+// Import wal provided via CopyData
+info!("importing wal");
+pgb.write_message(&BeMessage::CopyInResponse)?;
+let reader = CopyInReader::new(pgb);
+import_wal_from_tar(&mut datadir_timeline, reader, start_lsn, end_lsn)?;
+
+// TODO Does it make sense to overshoot?
+ensure!(datadir_timeline.tline.get_last_record_lsn() >= end_lsn);
+
+// Flush data to disk, then upload to s3. No need for a forced checkpoint.
+// We only want to persist the data, and it doesn't matter if it's in the
+// shape of deltas or images.
+info!("flushing layers");
+datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?;
+
+info!("done");
+Ok(())
+}
+
 /// Helper function to handle the LSN from client request.
 ///
 /// Each GetPage (and Exists and Nblocks) request includes information about
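`handle_import_wal` brackets the import with two `ensure!` checks: the timeline must sit exactly at the advertised start LSN before reading, and must have reached at least the advertised end LSN afterwards. A standalone sketch of those invariants, with LSNs reduced to `u64` and a hypothetical function name:

```rust
use anyhow::{ensure, Result};

/// Illustrative pre/post conditions for a WAL import, mirroring the
/// ensure! checks in handle_import_wal above.
fn check_wal_import_bounds(before: u64, after: u64, start_lsn: u64, end_lsn: u64) -> Result<()> {
    // The timeline must be exactly at the point where the WAL stream begins...
    ensure!(before == start_lsn, "timeline at {before:#x}, expected {start_lsn:#x}");
    // ...and after replaying the stream it must have reached the advertised end.
    ensure!(after >= end_lsn, "import stopped at {after:#x}, expected at least {end_lsn:#x}");
    Ok(())
}

fn main() -> Result<()> {
    // Hypothetical LSNs: start and end of the imported WAL segment.
    check_wal_import_bounds(0x169C348, 0x1702000, 0x169C348, 0x1702000)
}
```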
@@ -545,17 +733,10 @@ impl PageServerHandler {
 let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
 let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
 
-let all_rels = timeline.list_rels(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?;
-let mut total_blocks: i64 = 0;
-
-for rel in all_rels {
-if rel.forknum == 0 {
-let n_blocks = timeline.get_rel_size(rel, lsn).unwrap_or(0);
-total_blocks += n_blocks as i64;
-}
-}
-
-let db_size = total_blocks * pg_constants::BLCKSZ as i64;
+let total_blocks =
+timeline.get_db_size(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?;
+
+let db_size = total_blocks as i64 * pg_constants::BLCKSZ as i64;
 
 Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
 db_size,
@@ -591,7 +772,9 @@ impl PageServerHandler {
 pgb: &mut PostgresBackend,
 timelineid: ZTimelineId,
 lsn: Option<Lsn>,
+prev_lsn: Option<Lsn>,
 tenantid: ZTenantId,
+full_backup: bool,
 ) -> anyhow::Result<()> {
 let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty);
 let _enter = span.enter();
@@ -614,7 +797,8 @@ impl PageServerHandler {
 {
 let mut writer = CopyDataSink { pgb };
 
-let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
+let basebackup =
+basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
 span.record("lsn", &basebackup.lsn.to_string().as_str());
 basebackup.send_tarball()?;
 }
@@ -672,6 +856,10 @@ impl postgres_backend::Handler for PageServerHandler {
 Ok(())
 }
 
+fn is_shutdown_requested(&self) -> bool {
+thread_mgr::is_shutdown_requested()
+}
+
 fn process_query(
 &mut self,
 pgb: &mut PostgresBackend,
@@ -713,8 +901,119 @@ impl postgres_backend::Handler for PageServerHandler {
 };
 
 // Check that the timeline exists
-self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?;
+self.handle_basebackup_request(pgb, timelineid, lsn, None, tenantid, false)?;
 pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+}
+// return pair of prev_lsn and last_lsn
+else if query_string.starts_with("get_last_record_rlsn ") {
+let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
+let params = params_raw.split_whitespace().collect::<Vec<_>>();
+
+ensure!(
+params.len() == 2,
+"invalid param number for get_last_record_rlsn command"
+);
+
+let tenantid = ZTenantId::from_str(params[0])?;
+let timelineid = ZTimelineId::from_str(params[1])?;
+
+self.check_permission(Some(tenantid))?;
+let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
+.context("Cannot load local timeline")?;
+
+let end_of_timeline = timeline.tline.get_last_record_rlsn();
+
+pgb.write_message_noflush(&BeMessage::RowDescription(&[
+RowDescriptor::text_col(b"prev_lsn"),
+RowDescriptor::text_col(b"last_lsn"),
+]))?
+.write_message_noflush(&BeMessage::DataRow(&[
+Some(end_of_timeline.prev.to_string().as_bytes()),
+Some(end_of_timeline.last.to_string().as_bytes()),
+]))?
+.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+}
+// same as basebackup, but result includes relational data as well
+else if query_string.starts_with("fullbackup ") {
+let (_, params_raw) = query_string.split_at("fullbackup ".len());
+let params = params_raw.split_whitespace().collect::<Vec<_>>();
+
+ensure!(
+params.len() >= 2,
+"invalid param number for fullbackup command"
+);
+
+let tenantid = ZTenantId::from_str(params[0])?;
+let timelineid = ZTimelineId::from_str(params[1])?;
+
+// The caller is responsible for providing correct lsn and prev_lsn.
+let lsn = if params.len() > 2 {
+Some(Lsn::from_str(params[2])?)
+} else {
+None
+};
+let prev_lsn = if params.len() > 3 {
+Some(Lsn::from_str(params[3])?)
+} else {
+None
+};
+
+self.check_permission(Some(tenantid))?;
+
+// Check that the timeline exists
+self.handle_basebackup_request(pgb, timelineid, lsn, prev_lsn, tenantid, true)?;
+pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+} else if query_string.starts_with("import basebackup ") {
+// Import the `base` section (everything but the wal) of a basebackup.
+// Assumes the tenant already exists on this pageserver.
+//
+// Files are scheduled to be persisted to remote storage, and the
+// caller should poll the http api to check when that is done.
+//
+// Example import command:
+// 1. Get start/end LSN from backup_manifest file
+// 2. Run:
+// cat my_backup/base.tar | psql -h $PAGESERVER \
+// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN"
+let (_, params_raw) = query_string.split_at("import basebackup ".len());
+let params = params_raw.split_whitespace().collect::<Vec<_>>();
+ensure!(params.len() == 4);
+let tenant = ZTenantId::from_str(params[0])?;
+let timeline = ZTimelineId::from_str(params[1])?;
+let base_lsn = Lsn::from_str(params[2])?;
+let end_lsn = Lsn::from_str(params[3])?;
+
+self.check_permission(Some(tenant))?;
+
+match self.handle_import_basebackup(pgb, tenant, timeline, base_lsn, end_lsn) {
+Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
+Err(e) => {
+error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
+pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?
+}
+};
+} else if query_string.starts_with("import wal ") {
+// Import the `pg_wal` section of a basebackup.
+//
+// Files are scheduled to be persisted to remote storage, and the
+// caller should poll the http api to check when that is done.
+let (_, params_raw) = query_string.split_at("import wal ".len());
+let params = params_raw.split_whitespace().collect::<Vec<_>>();
+ensure!(params.len() == 4);
+let tenant = ZTenantId::from_str(params[0])?;
+let timeline = ZTimelineId::from_str(params[1])?;
+let start_lsn = Lsn::from_str(params[2])?;
+let end_lsn = Lsn::from_str(params[3])?;
+
+self.check_permission(Some(tenant))?;
+
+match self.handle_import_wal(pgb, tenant, timeline, start_lsn, end_lsn) {
+Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
+Err(e) => {
+error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
+pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?
+}
+};
 } else if query_string.to_ascii_lowercase().starts_with("set ") {
 // important because psycopg2 executes "SET datestyle TO 'ISO'"
 // on connect
@@ -802,7 +1101,6 @@ impl postgres_backend::Handler for PageServerHandler {
 .map(|h| h.as_str().parse())
 .unwrap_or_else(|| Ok(repo.get_gc_horizon()))?;
 
-let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
 // Use tenant's pitr setting
 let pitr = repo.get_pitr_interval();
 let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?;
@@ -895,6 +1193,7 @@ impl postgres_backend::Handler for PageServerHandler {
 LsnForTimestamp::Present(lsn) => format!("{}", lsn),
 LsnForTimestamp::Future(_lsn) => "future".into(),
 LsnForTimestamp::Past(_lsn) => "past".into(),
+LsnForTimestamp::NoData(_lsn) => "nodata".into(),
 };
 pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?;
 pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
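`process_query` dispatches on plain text commands such as `import basebackup <tenant> <timeline> <start_lsn> <end_lsn>`, splitting the parameter string on whitespace. A self-contained sketch of that parsing step; the real handler converts the fields into `ZTenantId`, `ZTimelineId` and `Lsn`, which are kept as strings here to avoid depending on those types:

```rust
use anyhow::{bail, ensure, Result};

/// Illustrative parser for the text command handled above:
/// "import basebackup <tenant> <timeline> <start_lsn> <end_lsn>".
fn parse_import_basebackup(query: &str) -> Result<[String; 4]> {
    let rest = match query.strip_prefix("import basebackup ") {
        Some(rest) => rest,
        None => bail!("not an import basebackup command"),
    };
    let params: Vec<&str> = rest.split_whitespace().collect();
    ensure!(params.len() == 4, "expected 4 parameters, got {}", params.len());
    Ok([params[0].into(), params[1].into(), params[2].into(), params[3].into()])
}

fn main() -> Result<()> {
    // IDs and LSNs below are made up for illustration.
    let [tenant, timeline, start, end] =
        parse_import_basebackup("import basebackup 1111aaaa 2222bbbb 0/169C348 0/1702000")?;
    println!("tenant={tenant} timeline={timeline} wal {start}..{end}");
    Ok(())
}
```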
@@ -51,6 +51,7 @@ pub enum LsnForTimestamp {
 Present(Lsn),
 Future(Lsn),
 Past(Lsn),
+NoData(Lsn),
 }
 
 impl<R: Repository> DatadirTimeline<R> {
@@ -123,6 +124,19 @@ impl<R: Repository> DatadirTimeline<R> {
 self.tline.get(key, lsn)
 }
 
+// Get size of a database in blocks
+pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<usize> {
+let mut total_blocks = 0;
+
+let rels = self.list_rels(spcnode, dbnode, lsn)?;
+
+for rel in rels {
+let n_blocks = self.get_rel_size(rel, lsn)?;
+total_blocks += n_blocks as usize;
+}
+Ok(total_blocks)
+}
+
 /// Get size of a relation file
 pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<BlockNumber> {
 ensure!(tag.relnode != 0, "invalid relnode");
@@ -250,7 +264,7 @@ impl<R: Repository> DatadirTimeline<R> {
 (false, false) => {
 // This can happen if no commit records have been processed yet, e.g.
 // just after importing a cluster.
-bail!("no commit timestamps found");
+Ok(LsnForTimestamp::NoData(max_lsn))
 }
 (true, false) => {
 // Didn't find any commit timestamps larger than the request
@@ -667,6 +681,10 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
 }
 
 pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> {
+let req_lsn = self.tline.get_last_record_lsn();
+
+let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn)?;
+
 // Remove entry from dbdir
 let buf = self.get(DBDIR_KEY)?;
 let mut dir = DbDirectory::des(&buf)?;
@@ -680,7 +698,8 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
 );
 }
 
-// FIXME: update pending_nblocks
+// Update logical database size.
+self.pending_nblocks -= total_blocks as isize;
 
 // Delete all relations and metadata files for the spcnode/dnode
 self.delete(dbdir_key_range(spcnode, dbnode));
@@ -749,6 +768,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
 }
 
 /// Extend relation
+/// If new size is smaller, do nothing.
 pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
 ensure!(rel.relnode != 0, "invalid relnode");
 
@@ -756,10 +776,13 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
 let size_key = rel_size_to_key(rel);
 let old_size = self.get(size_key)?.get_u32_le();
 
-let buf = nblocks.to_le_bytes();
-self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
+// only extend relation here. never decrease the size
+if nblocks > old_size {
+let buf = nblocks.to_le_bytes();
+self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
 
 self.pending_nblocks += nblocks as isize - old_size as isize;
+}
 Ok(())
 }
 
@@ -879,6 +902,57 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
 Ok(())
 }
 
+///
+/// Flush changes accumulated so far to the underlying repository.
+///
+/// Usually, changes made in DatadirModification are atomic, but this allows
+/// you to flush them to the underlying repository before the final `commit`.
+/// That allows to free up the memory used to hold the pending changes.
+///
+/// Currently only used during bulk import of a data directory. In that
+/// context, breaking the atomicity is OK. If the import is interrupted, the
+/// whole import fails and the timeline will be deleted anyway.
+/// (Or to be precise, it will be left behind for debugging purposes and
+/// ignored, see https://github.com/neondatabase/neon/pull/1809)
+///
+/// Note: A consequence of flushing the pending operations is that they
+/// won't be visible to subsequent operations until `commit`. The function
+/// retains all the metadata, but data pages are flushed. That's again OK
+/// for bulk import, where you are just loading data pages and won't try to
+/// modify the same pages twice.
+pub fn flush(&mut self) -> Result<()> {
+// Unless we have accumulated a decent amount of changes, it's not worth it
+// to scan through the pending_updates list.
+let pending_nblocks = self.pending_nblocks;
+if pending_nblocks < 10000 {
+return Ok(());
+}
+
+let writer = self.tline.tline.writer();
+
+// Flush relation and SLRU data blocks, keep metadata.
+let mut result: Result<()> = Ok(());
+self.pending_updates.retain(|&key, value| {
+if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
+result = writer.put(key, self.lsn, value);
+false
+} else {
+true
+}
+});
+result?;
+
+if pending_nblocks != 0 {
+self.tline.current_logical_size.fetch_add(
+pending_nblocks * pg_constants::BLCKSZ as isize,
+Ordering::SeqCst,
+);
+self.pending_nblocks = 0;
+}
+
+Ok(())
+}
+
 ///
 /// Finish this atomic update, writing all the updated keys to the
 /// underlying timeline.
@@ -889,7 +963,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
 let pending_nblocks = self.pending_nblocks;
 
 for (key, value) in self.pending_updates {
-writer.put(key, self.lsn, value)?;
+writer.put(key, self.lsn, &value)?;
 }
 for key_range in self.pending_deletions {
 writer.delete(key_range.clone(), self.lsn)?;
@@ -1294,6 +1368,10 @@ pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
 })
 }
 
+fn is_rel_block_key(key: Key) -> bool {
+key.field1 == 0x00 && key.field4 != 0
+}
+
 pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
 Ok(match key.field1 {
 0x01 => {
@@ -1312,6 +1390,12 @@ pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
 })
 }
 
+fn is_slru_block_key(key: Key) -> bool {
+key.field1 == 0x01 // SLRU-related
+&& key.field3 == 0x00000001 // but not SlruDir
+&& key.field6 != 0xffffffff // and not SlruSegSize
+}
+
 //
 //-- Tests that should work the same with any Repository/Timeline implementation.
 //
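The new `get_db_size` returns a block count; the page service multiplies it by `pg_constants::BLCKSZ` to report bytes. A tiny sketch of that conversion, assuming the standard 8 KiB Postgres block size:

```rust
/// Standard Postgres block size; the pageserver takes this from pg_constants.
const BLCKSZ: u64 = 8192;

/// Convert the block count returned by get_db_size into bytes,
/// as the db_size request handler does above.
fn db_size_bytes(total_blocks: u64) -> u64 {
    total_blocks * BLCKSZ
}

fn main() {
    // 2048 blocks of 8 KiB each is exactly 16 MiB.
    assert_eq!(db_size_bytes(2048), 16 * 1024 * 1024);
}
```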
@@ -81,6 +81,12 @@ mod profiling_impl {
 
 pub struct DummyProfilerGuard;
 
+impl Drop for DummyProfilerGuard {
+fn drop(&mut self) {
+// do nothing, this exists to calm Clippy down
+}
+}
+
 pub fn profpoint_start(
 _conf: &PageServerConf,
 _point: ProfilingConfig,
@@ -3,7 +3,7 @@ use std::cmp::Ordering;
 use std::fmt;

 use postgres_ffi::relfile_utils::forknumber_to_name;
-use postgres_ffi::Oid;
+use postgres_ffi::{pg_constants, Oid};

 ///
 /// Relation data file segment id throughout the Postgres cluster.
@@ -75,6 +75,30 @@ impl fmt::Display for RelTag {
 }
 }

+impl RelTag {
+pub fn to_segfile_name(&self, segno: u32) -> String {
+let mut name = if self.spcnode == pg_constants::GLOBALTABLESPACE_OID {
+"global/".to_string()
+} else {
+format!("base/{}/", self.dbnode)
+};
+
+name += &self.relnode.to_string();
+
+if let Some(fork_name) = forknumber_to_name(self.forknum) {
+name += "_";
+name += fork_name;
+}
+
+if segno != 0 {
+name += ".";
+name += &segno.to_string();
+}
+
+name
+}
+}
+
 ///
 /// Non-relation transaction status files (clog (a.k.a. pg_xact) and
 /// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer,
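Note: as a quick illustration of the segment-file paths the new `RelTag::to_segfile_name` helper above produces, here is a minimal standalone sketch. The OIDs are made up, `GLOBALTABLESPACE_OID = 1664` is the usual Postgres value assumed here, and the fork-name strings stand in for whatever `forknumber_to_name` returns (assumed: `None` for the main fork, "fsm"/"vm"/"init" otherwise).

// Standalone sketch mirroring RelTag::to_segfile_name; OIDs and fork names are illustrative only.
fn segfile_name(spcnode: u32, dbnode: u32, relnode: u32, fork: Option<&str>, segno: u32) -> String {
    const GLOBALTABLESPACE_OID: u32 = 1664; // assumed value of pg_constants::GLOBALTABLESPACE_OID
    let mut name = if spcnode == GLOBALTABLESPACE_OID {
        "global/".to_string()
    } else {
        format!("base/{}/", dbnode)
    };
    name += &relnode.to_string();
    if let Some(fork_name) = fork {
        name += "_";
        name += fork_name;
    }
    if segno != 0 {
        name += ".";
        name += &segno.to_string();
    }
    name
}

fn main() {
    // main fork, first segment of an ordinary relation
    assert_eq!(segfile_name(1663, 13008, 16384, None, 0), "base/13008/16384");
    // segment number 2 of the free-space-map fork
    assert_eq!(segfile_name(1663, 13008, 16384, Some("fsm"), 2), "base/13008/16384_fsm.2");
    // shared catalogs live under global/
    assert_eq!(segfile_name(1664, 0, 1262, None, 0), "global/1262");
}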
@@ -7,7 +7,6 @@ use byteorder::{ByteOrder, BE};
 use bytes::Bytes;
 use serde::{Deserialize, Serialize};
 use std::fmt;
-use std::fmt::Display;
 use std::ops::{AddAssign, Range};
 use std::sync::{Arc, RwLockReadGuard};
 use std::time::Duration;
@@ -182,33 +181,15 @@ impl Value {
 }
 }

-#[derive(Clone, Copy, Debug)]
-pub enum TimelineSyncStatusUpdate {
-Downloaded,
-}
-
-impl Display for TimelineSyncStatusUpdate {
-fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-let s = match self {
-TimelineSyncStatusUpdate::Downloaded => "Downloaded",
-};
-f.write_str(s)
-}
-}
-
 ///
-/// A repository corresponds to one .zenith directory. One repository holds multiple
+/// A repository corresponds to one .neon directory. One repository holds multiple
 /// timelines, forked off from the same initial call to 'initdb'.
 pub trait Repository: Send + Sync {
 type Timeline: Timeline;

 /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization.
 /// See [`crate::remote_storage`] for more details about the synchronization.
-fn apply_timeline_remote_sync_status_update(
-&self,
-timeline_id: ZTimelineId,
-timeline_sync_status_update: TimelineSyncStatusUpdate,
-) -> Result<()>;
+fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;

 /// Get Timeline handle for given zenith timeline ID.
 /// This function is idempotent. It doesn't change internal state in any way.
@@ -225,12 +206,17 @@ pub trait Repository: Send + Sync {
 /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
 fn create_empty_timeline(
 &self,
-timelineid: ZTimelineId,
+timeline_id: ZTimelineId,
 initdb_lsn: Lsn,
 ) -> Result<Arc<Self::Timeline>>;

 /// Branch a timeline
-fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
+fn branch_timeline(
+&self,
+src: ZTimelineId,
+dst: ZTimelineId,
+start_lsn: Option<Lsn>,
+) -> Result<()>;

 /// Flush all data to disk.
 ///
@@ -260,10 +246,10 @@ pub trait Repository: Send + Sync {
 /// api's 'compact' command.
 fn compaction_iteration(&self) -> Result<()>;

-/// detaches timeline-related in-memory data.
-fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
+/// removes timeline-related in-memory data
+fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()>;

-// Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn.
+/// Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn.
 fn get_remote_index(&self) -> &RemoteIndex;
 }

@@ -407,7 +393,7 @@ pub trait TimelineWriter<'a> {
 ///
 /// This will implicitly extend the relation, if the page is beyond the
 /// current end-of-file.
-fn put(&self, key: Key, lsn: Lsn, value: Value) -> Result<()>;
+fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()>;

 fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()>;

@@ -537,7 +523,7 @@ pub mod repo_harness {
 TenantConfOpt::from(self.tenant_conf),
 walredo_mgr,
 self.tenant_id,
-RemoteIndex::empty(),
+RemoteIndex::default(),
 false,
 );
 // populate repo with locally available timelines
@@ -553,10 +539,7 @@ pub mod repo_harness {
 .parse()
 .unwrap();

-repo.apply_timeline_remote_sync_status_update(
-timeline_id,
-TimelineSyncStatusUpdate::Downloaded,
-)?;
+repo.attach_timeline(timeline_id)?;
 }

 Ok(repo)
@@ -620,12 +603,12 @@ mod tests {
 let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;

 let writer = tline.writer();
-writer.put(*TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?;
+writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
 writer.finish_write(Lsn(0x10));
 drop(writer);

 let writer = tline.writer();
-writer.put(*TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?;
+writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
 writer.finish_write(Lsn(0x20));
 drop(writer);

@@ -636,6 +619,19 @@ mod tests {
 Ok(())
 }

+#[test]
+fn no_duplicate_timelines() -> Result<()> {
+let repo = RepoHarness::create("no_duplicate_timelines")?.load();
+let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
+
+match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) {
+Ok(_) => panic!("duplicate timeline creation should fail"),
+Err(e) => assert_eq!(e.to_string(), "Timeline already exists"),
+}
+
+Ok(())
+}
+
 /// Convenience function to create a page image with given string as the only content
 pub fn test_value(s: &str) -> Value {
 let mut buf = BytesMut::new();
@@ -659,24 +655,24 @@ mod tests {
 let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap();

 // Insert a value on the timeline
-writer.put(TEST_KEY_A, Lsn(0x20), test_value("foo at 0x20"))?;
-writer.put(TEST_KEY_B, Lsn(0x20), test_value("foobar at 0x20"))?;
+writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?;
+writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?;
 writer.finish_write(Lsn(0x20));

-writer.put(TEST_KEY_A, Lsn(0x30), test_value("foo at 0x30"))?;
+writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?;
 writer.finish_write(Lsn(0x30));
-writer.put(TEST_KEY_A, Lsn(0x40), test_value("foo at 0x40"))?;
+writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?;
 writer.finish_write(Lsn(0x40));

 //assert_current_logical_size(&tline, Lsn(0x40));

 // Branch the history, modify relation differently on the new timeline
-repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
+repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?;
 let newtline = repo
 .get_timeline_load(NEW_TIMELINE_ID)
 .expect("Should have a local timeline");
 let new_writer = newtline.writer();
-new_writer.put(TEST_KEY_A, Lsn(0x40), test_value("bar at 0x40"))?;
+new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?;
 new_writer.finish_write(Lsn(0x40));

 // Check page contents on both branches
@@ -707,14 +703,14 @@ mod tests {
 writer.put(
 *TEST_KEY,
 lsn,
-Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
 )?;
 writer.finish_write(lsn);
 lsn += 0x10;
 writer.put(
 *TEST_KEY,
 lsn,
-Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
 )?;
 writer.finish_write(lsn);
 lsn += 0x10;
@@ -725,14 +721,14 @@ mod tests {
 writer.put(
 *TEST_KEY,
 lsn,
-Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
 )?;
 writer.finish_write(lsn);
 lsn += 0x10;
 writer.put(
 *TEST_KEY,
 lsn,
-Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
 )?;
 writer.finish_write(lsn);
 }
@@ -753,7 +749,7 @@ mod tests {
 repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;

 // try to branch at lsn 25, should fail because we already garbage collected the data
-match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
+match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
 Ok(_) => panic!("branching should have failed"),
 Err(err) => {
 assert!(err.to_string().contains("invalid branch start lsn"));
@@ -774,7 +770,7 @@ mod tests {

 repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?;
 // try to branch at lsn 0x25, should fail because initdb lsn is 0x50
-match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
+match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
 Ok(_) => panic!("branching should have failed"),
 Err(err) => {
 assert!(&err.to_string().contains("invalid branch start lsn"));
@@ -819,7 +815,7 @@ mod tests {
 let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
 make_some_layers(tline.as_ref(), Lsn(0x20))?;

-repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
+repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
 let newtline = repo
 .get_timeline_load(NEW_TIMELINE_ID)
 .expect("Should have a local timeline");
@@ -835,7 +831,7 @@ mod tests {
 let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
 make_some_layers(tline.as_ref(), Lsn(0x20))?;

-repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
+repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
 let newtline = repo
 .get_timeline_load(NEW_TIMELINE_ID)
 .expect("Should have a local timeline");
@@ -893,7 +889,7 @@ mod tests {
 make_some_layers(tline.as_ref(), Lsn(0x20))?;
 tline.checkpoint(CheckpointConfig::Forced)?;

-repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
+repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;

 let newtline = repo
 .get_timeline_load(NEW_TIMELINE_ID)
@@ -178,20 +178,20 @@ use crate::{
 metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME},
 LayeredRepository,
 },
-repository::TimelineSyncStatusUpdate,
 storage_sync::{self, index::RemoteIndex},
-tenant_mgr::apply_timeline_sync_status_updates,
+tenant_mgr::attach_downloaded_tenants,
 thread_mgr,
 thread_mgr::ThreadKind,
 };

 use metrics::{
-register_histogram_vec, register_int_counter, register_int_gauge, HistogramVec, IntCounter,
-IntGauge,
+register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge,
+HistogramVec, IntCounter, IntCounterVec, IntGauge,
 };
 use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};

-pub use self::download::download_index_part;
+use self::download::download_index_parts;
+pub use self::download::gather_tenant_timelines_index_parts;
 pub use self::download::TEMP_DOWNLOAD_EXTENSION;

 lazy_static! {
@@ -208,14 +208,17 @@ lazy_static! {
 static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!(
 "pageserver_remote_storage_image_sync_seconds",
 "Time took to synchronize (download or upload) a whole pageserver image. \
-Grouped by `operation_kind` (upload|download) and `status` (success|failure)",
-&["operation_kind", "status"],
-vec![
-0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 5.0, 6.0, 7.0,
-8.0, 9.0, 10.0, 12.5, 15.0, 17.5, 20.0
-]
+Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)",
+&["tenant_id", "timeline_id", "operation_kind", "status"],
+vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0]
 )
 .expect("failed to register pageserver image sync time histogram vec");
+static ref REMOTE_INDEX_UPLOAD: IntCounterVec = register_int_counter_vec!(
+"pageserver_remote_storage_remote_index_uploads_total",
+"Number of remote index uploads",
+&["tenant_id", "timeline_id"],
+)
+.expect("failed to register pageserver remote index upload vec");
 }

 static SYNC_QUEUE: OnceCell<SyncQueue> = OnceCell::new();
@@ -298,7 +301,7 @@ pub fn start_local_timeline_sync(
 }
 Ok(SyncStartupData {
 local_timeline_init_statuses,
-remote_index: RemoteIndex::empty(),
+remote_index: RemoteIndex::default(),
 })
 }
 }
@@ -832,7 +835,7 @@ where
 .build()
 .context("Failed to create storage sync runtime")?;

-let applicable_index_parts = runtime.block_on(try_fetch_index_parts(
+let applicable_index_parts = runtime.block_on(download_index_parts(
 conf,
 &storage,
 local_timeline_files.keys().copied().collect(),
@@ -915,16 +918,48 @@ fn storage_sync_loop<P, S>(
 });

 match loop_step {
-ControlFlow::Continue(new_timeline_states) => {
-if new_timeline_states.is_empty() {
-debug!("Sync loop step completed, no new timeline states");
+ControlFlow::Continue(updated_tenants) => {
+if updated_tenants.is_empty() {
+debug!("Sync loop step completed, no new tenant states");
 } else {
 info!(
-"Sync loop step completed, {} new timeline state update(s)",
-new_timeline_states.len()
+"Sync loop step completed, {} new tenant state update(s)",
+updated_tenants.len()
 );
+let mut sync_status_updates: HashMap<ZTenantId, HashSet<ZTimelineId>> =
+HashMap::new();
+let index_accessor = runtime.block_on(index.write());
+for tenant_id in updated_tenants {
+let tenant_entry = match index_accessor.tenant_entry(&tenant_id) {
+Some(tenant_entry) => tenant_entry,
+None => {
+error!(
+"cannot find tenant in remote index for timeline sync update"
+);
+continue;
+}
+};
+
+if tenant_entry.has_in_progress_downloads() {
+info!("Tenant {tenant_id} has pending timeline downloads, skipping repository registration");
+continue;
+} else {
+info!(
+"Tenant {tenant_id} download completed. Picking to register in repository"
+);
+// Here we assume that if tenant has no in-progress downloads that
+// means that it is the last completed timeline download that triggered
+// sync status update. So we look at the index for available timelines
+// and register them all at once in a repository for download
+// to be submitted in a single operation to repository
+// so it can apply them at once to internal timeline map.
+sync_status_updates
+.insert(tenant_id, tenant_entry.keys().copied().collect());
+}
+}
+drop(index_accessor);
 // Batch timeline download registration to ensure that the external registration code won't block any running tasks before.
-apply_timeline_sync_status_updates(conf, &index, new_timeline_states);
+attach_downloaded_tenants(conf, &index, sync_status_updates);
 }
 }
 ControlFlow::Break(()) => {
@@ -935,6 +970,14 @@ fn storage_sync_loop<P, S>(
 }
 }

+// needed to check whether the download happened
+// more informative than just a bool
+#[derive(Debug)]
+enum DownloadMarker {
+Downloaded,
+Nothing,
+}
+
 async fn process_batches<P, S>(
 conf: &'static PageServerConf,
 max_sync_errors: NonZeroU32,
@@ -942,7 +985,7 @@ async fn process_batches<P, S>(
 index: &RemoteIndex,
 batched_tasks: HashMap<ZTenantTimelineId, SyncTaskBatch>,
 sync_queue: &SyncQueue,
-) -> HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncStatusUpdate>>
+) -> HashSet<ZTenantId>
 where
 P: Debug + Send + Sync + 'static,
 S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
@@ -967,22 +1010,19 @@ where
 })
 .collect::<FuturesUnordered<_>>();

-let mut new_timeline_states: HashMap<
-ZTenantId,
-HashMap<ZTimelineId, TimelineSyncStatusUpdate>,
-> = HashMap::new();
+let mut downloaded_timelines = HashSet::new();

-while let Some((sync_id, state_update)) = sync_results.next().await {
-debug!("Finished storage sync task for sync id {sync_id}");
-if let Some(state_update) = state_update {
-new_timeline_states
-.entry(sync_id.tenant_id)
-.or_default()
-.insert(sync_id.timeline_id, state_update);
+while let Some((sync_id, download_marker)) = sync_results.next().await {
+debug!(
+"Finished storage sync task for sync id {sync_id} download marker {:?}",
+download_marker
+);
+if matches!(download_marker, DownloadMarker::Downloaded) {
+downloaded_timelines.insert(sync_id.tenant_id);
 }
 }

-new_timeline_states
+downloaded_timelines
 }

 async fn process_sync_task_batch<P, S>(
@@ -991,7 +1031,7 @@ async fn process_sync_task_batch<P, S>(
 max_sync_errors: NonZeroU32,
 sync_id: ZTenantTimelineId,
 batch: SyncTaskBatch,
-) -> Option<TimelineSyncStatusUpdate>
+) -> DownloadMarker
 where
 P: Debug + Send + Sync + 'static,
 S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
@@ -1076,7 +1116,7 @@ where
 }
 }
 }
-None
+DownloadMarker::Nothing
 }
 .instrument(info_span!("download_timeline_data")),
 );
@@ -1130,7 +1170,7 @@ async fn download_timeline_data<P, S>(
 new_download_data: SyncData<LayersDownload>,
 sync_start: Instant,
 task_name: &str,
-) -> Option<TimelineSyncStatusUpdate>
+) -> DownloadMarker
 where
 P: Debug + Send + Sync + 'static,
 S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
@@ -1146,20 +1186,20 @@ where
 .await
 {
 DownloadedTimeline::Abort => {
-register_sync_status(sync_start, task_name, None);
+register_sync_status(sync_id, sync_start, task_name, None);
 if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) {
 error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}");
 }
 }
 DownloadedTimeline::FailedAndRescheduled => {
-register_sync_status(sync_start, task_name, Some(false));
+register_sync_status(sync_id, sync_start, task_name, Some(false));
 }
 DownloadedTimeline::Successful(mut download_data) => {
 match update_local_metadata(conf, sync_id, current_remote_timeline).await {
 Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
 Ok(()) => {
-register_sync_status(sync_start, task_name, Some(true));
-return Some(TimelineSyncStatusUpdate::Downloaded);
+register_sync_status(sync_id, sync_start, task_name, Some(true));
+return DownloadMarker::Downloaded;
 }
 Err(e) => {
 error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}");
@@ -1169,13 +1209,13 @@ where
 error!("Failed to update local timeline metadata: {e:?}");
 download_data.retries += 1;
 sync_queue.push(sync_id, SyncTask::Download(download_data));
-register_sync_status(sync_start, task_name, Some(false));
+register_sync_status(sync_id, sync_start, task_name, Some(false));
 }
 }
 }
 }

-None
+DownloadMarker::Nothing
 }

 async fn update_local_metadata(
@@ -1265,14 +1305,14 @@ async fn delete_timeline_data<P, S>(
 error!("Failed to update remote timeline {sync_id}: {e:?}");
 new_delete_data.retries += 1;
 sync_queue.push(sync_id, SyncTask::Delete(new_delete_data));
-register_sync_status(sync_start, task_name, Some(false));
+register_sync_status(sync_id, sync_start, task_name, Some(false));
 return;
 }
 }
 timeline_delete.deletion_registered = true;

 let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await;
-register_sync_status(sync_start, task_name, Some(sync_status));
+register_sync_status(sync_id, sync_start, task_name, Some(sync_status));
 }

 async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result<TimelineMetadata> {
@@ -1306,7 +1346,7 @@ async fn upload_timeline_data<P, S>(
 .await
 {
 UploadedTimeline::FailedAndRescheduled => {
-register_sync_status(sync_start, task_name, Some(false));
+register_sync_status(sync_id, sync_start, task_name, Some(false));
 return;
 }
 UploadedTimeline::Successful(upload_data) => upload_data,
@@ -1325,13 +1365,13 @@ async fn upload_timeline_data<P, S>(
 .await
 {
 Ok(()) => {
-register_sync_status(sync_start, task_name, Some(true));
+register_sync_status(sync_id, sync_start, task_name, Some(true));
 }
 Err(e) => {
 error!("Failed to update remote timeline {sync_id}: {e:?}");
 uploaded_data.retries += 1;
 sync_queue.push(sync_id, SyncTask::Upload(uploaded_data));
-register_sync_status(sync_start, task_name, Some(false));
+register_sync_status(sync_id, sync_start, task_name, Some(false));
 }
 }
 }
@@ -1421,7 +1461,14 @@ where
 IndexPart::from_remote_timeline(&timeline_path, updated_remote_timeline)
 .context("Failed to create an index part from the updated remote timeline")?;

-info!("Uploading remote index for the timeline");
+debug!("Uploading remote index for the timeline");
+REMOTE_INDEX_UPLOAD
+.with_label_values(&[
+&sync_id.tenant_id.to_string(),
+&sync_id.timeline_id.to_string(),
+])
+.inc();
+
 upload_index_part(conf, storage, sync_id, new_index_part)
 .await
 .context("Failed to upload new index part")
@@ -1448,35 +1495,6 @@ async fn validate_task_retries<T>(
 ControlFlow::Continue(sync_data)
 }

-async fn try_fetch_index_parts<P, S>(
-conf: &'static PageServerConf,
-storage: &S,
-keys: HashSet<ZTenantTimelineId>,
-) -> HashMap<ZTenantTimelineId, IndexPart>
-where
-P: Debug + Send + Sync + 'static,
-S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
-{
-let mut index_parts = HashMap::with_capacity(keys.len());
-
-let mut part_downloads = keys
-.into_iter()
-.map(|id| async move { (id, download_index_part(conf, storage, id).await) })
-.collect::<FuturesUnordered<_>>();
-
-while let Some((id, part_upload_result)) = part_downloads.next().await {
-match part_upload_result {
-Ok(index_part) => {
-debug!("Successfully fetched index part for {id}");
-index_parts.insert(id, index_part);
-}
-Err(e) => warn!("Failed to fetch index part for {id}: {e}"),
-}
-}
-
-index_parts
-}
-
 fn schedule_first_sync_tasks(
 index: &mut RemoteTimelineIndex,
 sync_queue: &SyncQueue,
@@ -1590,12 +1608,24 @@ fn compare_local_and_remote_timeline(
 (initial_timeline_status, awaits_download)
 }

-fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Option<bool>) {
+fn register_sync_status(
+sync_id: ZTenantTimelineId,
+sync_start: Instant,
+sync_name: &str,
+sync_status: Option<bool>,
+) {
 let secs_elapsed = sync_start.elapsed().as_secs_f64();
-info!("Processed a sync task in {secs_elapsed:.2} seconds");
+debug!("Processed a sync task in {secs_elapsed:.2} seconds");

+let tenant_id = sync_id.tenant_id.to_string();
+let timeline_id = sync_id.timeline_id.to_string();
 match sync_status {
-Some(true) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "success"]),
-Some(false) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "failure"]),
+Some(true) => {
+IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "success"])
+}
+Some(false) => {
+IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "failure"])
+}
 None => return,
 }
 .observe(secs_elapsed)
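Note: with `tenant_id` and `timeline_id` added to the label set above, each sync duration is now recorded per tenant and timeline. For a Prometheus histogram vec this yields series such as pageserver_remote_storage_image_sync_seconds_bucket{tenant_id="...", timeline_id="...", operation_kind="download", status="success", le="1"} plus the matching _sum and _count series; the label values shown here are placeholders, not values from this change.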
@@ -1,10 +1,14 @@
 //! Timeline synchronization logic to fetch the layer files from remote storage into pageserver's local directory.

-use std::{collections::HashSet, fmt::Debug, path::Path};
+use std::{
+collections::{HashMap, HashSet},
+fmt::Debug,
+path::Path,
+};

 use anyhow::Context;
 use futures::stream::{FuturesUnordered, StreamExt};
-use remote_storage::{path_with_suffix_extension, RemoteStorage};
+use remote_storage::{path_with_suffix_extension, RemoteObjectName, RemoteStorage};
 use tokio::{
 fs,
 io::{self, AsyncWriteExt},
@@ -14,7 +18,7 @@ use tracing::{debug, error, info, warn};
 use crate::{
 config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask,
 };
-use utils::zid::ZTenantTimelineId;
+use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};

 use super::{
 index::{IndexPart, RemoteTimeline},
@@ -23,8 +27,108 @@ use super::{

 pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";

+/// FIXME: Needs cleanup. Currently it swallows errors. Here we need to ensure that
+/// we successfully downloaded all metadata parts for one tenant.
+/// And successful includes absence of index_part in the remote. Because it is valid situation
+/// when timeline was just created and pageserver restarted before upload of index part was completed.
+/// But currently RemoteStorage interface does not provide this knowledge because it uses
+/// anyhow::Error as an error type. So this needs a refactoring.
+///
+/// In other words we need to yield only complete sets of tenant timelines.
+/// Failure for one timeline of a tenant should exclude whole tenant from returned hashmap.
+/// So there are two requirements: keep everything in one futures unordered
+/// to allow higher concurrency. Mark tenants as failed independently.
+/// That requires some bookeeping.
+pub async fn download_index_parts<P, S>(
+conf: &'static PageServerConf,
+storage: &S,
+keys: HashSet<ZTenantTimelineId>,
+) -> HashMap<ZTenantId, HashMap<ZTimelineId, IndexPart>>
+where
+P: Debug + Send + Sync + 'static,
+S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
+{
+let mut index_parts: HashMap<ZTenantId, HashMap<ZTimelineId, IndexPart>> = HashMap::new();
+
+let mut part_downloads = keys
+.into_iter()
+.map(|id| async move { (id, download_index_part(conf, storage, id).await) })
+.collect::<FuturesUnordered<_>>();
+
+while let Some((id, part_upload_result)) = part_downloads.next().await {
+match part_upload_result {
+Ok(index_part) => {
+debug!("Successfully fetched index part for {id}");
+index_parts
+.entry(id.tenant_id)
+.or_default()
+.insert(id.timeline_id, index_part);
+}
+Err(e) => error!("Failed to fetch index part for {id}: {e}"),
+}
+}
+
+index_parts
+}
+
+/// Note: The function is rather expensive from s3 access point of view, it will execute ceil(N/1000) + N requests.
+/// At least one request to obtain a list of tenant timelines (more requests is there are more than 1000 timelines).
+/// And then will attempt to download all index files that belong to these timelines.
+pub async fn gather_tenant_timelines_index_parts<P, S>(
+conf: &'static PageServerConf,
+storage: &S,
+tenant_id: ZTenantId,
+) -> anyhow::Result<HashMap<ZTimelineId, IndexPart>>
+where
+P: RemoteObjectName + Debug + Send + Sync + 'static,
+S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
+{
+let tenant_path = conf.timelines_path(&tenant_id);
+let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| {
+format!(
+"Failed to get tenant storage path for local path '{}'",
+tenant_path.display()
+)
+})?;
+let timelines = storage
+.list_prefixes(Some(tenant_storage_path))
+.await
+.with_context(|| {
+format!(
+"Failed to list tenant storage path to get remote timelines to download: {}",
+tenant_id
+)
+})?;
+
+let mut sync_ids = HashSet::new();
+
+for timeline_remote_storage_key in timelines {
+let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
+anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
+})?;
+
+let timeline_id: ZTimelineId = object_name
+.parse()
+.with_context(|| {
+format!("failed to parse object name into timeline id for tenant {tenant_id} '{object_name}'")
+})?;
+
+sync_ids.insert(ZTenantTimelineId {
+tenant_id,
+timeline_id,
+});
+}
+
+download_index_parts(conf, storage, sync_ids)
+.await
+.remove(&tenant_id)
+.ok_or(anyhow::anyhow!(
+"Missing tenant index parts. This is a bug."
+))
+}
+
 /// Retrieves index data from the remote storage for a given timeline.
-pub async fn download_index_part<P, S>(
+async fn download_index_part<P, S>(
 conf: &'static PageServerConf,
 storage: &S,
 sync_id: ZTenantTimelineId,
@@ -44,13 +148,23 @@ where
 index_part_path.display()
 )
 })?;

+let mut index_part_download =
+storage
+.download(&part_storage_path)
+.await
+.with_context(|| {
+format!("Failed to open download stream for for storage path {part_storage_path:?}")
+})?;
 let mut index_part_bytes = Vec::new();
-storage
-.download(&part_storage_path, &mut index_part_bytes)
-.await
-.with_context(|| {
-format!("Failed to download an index part from storage path {part_storage_path:?}")
-})?;
+io::copy(
+&mut index_part_download.download_stream,
+&mut index_part_bytes,
+)
+.await
+.with_context(|| {
+format!("Failed to download an index part from storage path {part_storage_path:?}")
+})?;
+
 let index_part: IndexPart = serde_json::from_slice(&index_part_bytes).with_context(|| {
 format!("Failed to deserialize index part file from storage path '{part_storage_path:?}'")
@@ -162,15 +276,19 @@ where
 temp_file_path.display()
 )
 })?;
-storage
-.download(&layer_storage_path, &mut destination_file)
+let mut download = storage
+.download(&layer_storage_path)
 .await
 .with_context(|| {
 format!(
-"Failed to download a layer from storage path '{layer_storage_path:?}'"
+"Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'"
 )
 })?;
+io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
+format!(
+"Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display()
+)
+})?;

 // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
 // A file will not be closed immediately when it goes out of scope if there are any IO operations
@@ -2,6 +2,7 @@
 //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
 //! remote timeline layers and its metadata.

+use std::ops::{Deref, DerefMut};
 use std::{
 collections::{HashMap, HashSet},
 path::{Path, PathBuf},
@@ -14,7 +15,10 @@ use serde_with::{serde_as, DisplayFromStr};
 use tokio::sync::RwLock;

 use crate::{config::PageServerConf, layered_repository::metadata::TimelineMetadata};
-use utils::{lsn::Lsn, zid::ZTenantTimelineId};
+use utils::{
+lsn::Lsn,
+zid::{ZTenantId, ZTenantTimelineId, ZTimelineId},
+};

 /// A part of the filesystem path, that needs a root to become a path again.
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
@@ -41,38 +45,68 @@ impl RelativePath {
 }
 }

+#[derive(Debug, Clone, Default)]
+pub struct TenantEntry(HashMap<ZTimelineId, RemoteTimeline>);
+
+impl TenantEntry {
+pub fn has_in_progress_downloads(&self) -> bool {
+self.values()
+.any(|remote_timeline| remote_timeline.awaits_download)
+}
+}
+
+impl Deref for TenantEntry {
+type Target = HashMap<ZTimelineId, RemoteTimeline>;
+
+fn deref(&self) -> &Self::Target {
+&self.0
+}
+}
+
+impl DerefMut for TenantEntry {
+fn deref_mut(&mut self) -> &mut Self::Target {
+&mut self.0
+}
+}
+
+impl From<HashMap<ZTimelineId, RemoteTimeline>> for TenantEntry {
+fn from(inner: HashMap<ZTimelineId, RemoteTimeline>) -> Self {
+Self(inner)
+}
+}
+
 /// An index to track tenant files that exist on the remote storage.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Default)]
 pub struct RemoteTimelineIndex {
-timeline_entries: HashMap<ZTenantTimelineId, RemoteTimeline>,
+entries: HashMap<ZTenantId, TenantEntry>,
 }

 /// A wrapper to synchronize the access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`].
+#[derive(Default)]
 pub struct RemoteIndex(Arc<RwLock<RemoteTimelineIndex>>);

 impl RemoteIndex {
-pub fn empty() -> Self {
-Self(Arc::new(RwLock::new(RemoteTimelineIndex {
-timeline_entries: HashMap::new(),
-})))
-}
-
 pub fn from_parts(
 conf: &'static PageServerConf,
-index_parts: HashMap<ZTenantTimelineId, IndexPart>,
+index_parts: HashMap<ZTenantId, HashMap<ZTimelineId, IndexPart>>,
 ) -> anyhow::Result<Self> {
-let mut timeline_entries = HashMap::new();
+let mut entries: HashMap<ZTenantId, TenantEntry> = HashMap::new();

-for (sync_id, index_part) in index_parts {
-let timeline_path = conf.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
-let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part)
-.context("Failed to restore remote timeline data from index part")?;
-timeline_entries.insert(sync_id, remote_timeline);
+for (tenant_id, timelines) in index_parts {
+for (timeline_id, index_part) in timelines {
+let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
+let remote_timeline =
+RemoteTimeline::from_index_part(&timeline_path, index_part)
+.context("Failed to restore remote timeline data from index part")?;
+
+entries
+.entry(tenant_id)
+.or_default()
+.insert(timeline_id, remote_timeline);
+}
 }

-Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex {
-timeline_entries,
-}))))
+Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex { entries }))))
 }

 pub async fn read(&self) -> tokio::sync::RwLockReadGuard<'_, RemoteTimelineIndex> {
@@ -91,20 +125,67 @@ impl Clone for RemoteIndex {
 }

 impl RemoteTimelineIndex {
-pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&RemoteTimeline> {
-self.timeline_entries.get(id)
+pub fn timeline_entry(
+&self,
+ZTenantTimelineId {
+tenant_id,
+timeline_id,
+}: &ZTenantTimelineId,
+) -> Option<&RemoteTimeline> {
+self.entries.get(tenant_id)?.get(timeline_id)
 }

-pub fn timeline_entry_mut(&mut self, id: &ZTenantTimelineId) -> Option<&mut RemoteTimeline> {
-self.timeline_entries.get_mut(id)
+pub fn timeline_entry_mut(
+&mut self,
+ZTenantTimelineId {
+tenant_id,
+timeline_id,
+}: &ZTenantTimelineId,
+) -> Option<&mut RemoteTimeline> {
+self.entries.get_mut(tenant_id)?.get_mut(timeline_id)
 }

-pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: RemoteTimeline) {
-self.timeline_entries.insert(id, entry);
+pub fn add_timeline_entry(
+&mut self,
+ZTenantTimelineId {
+tenant_id,
+timeline_id,
+}: ZTenantTimelineId,
+entry: RemoteTimeline,
+) {
+self.entries
+.entry(tenant_id)
+.or_default()
+.insert(timeline_id, entry);
 }

-pub fn all_sync_ids(&self) -> impl Iterator<Item = ZTenantTimelineId> + '_ {
-self.timeline_entries.keys().copied()
+pub fn remove_timeline_entry(
+&mut self,
+ZTenantTimelineId {
+tenant_id,
+timeline_id,
+}: ZTenantTimelineId,
+) -> Option<RemoteTimeline> {
+self.entries
+.entry(tenant_id)
+.or_default()
+.remove(&timeline_id)
+}
+
+pub fn tenant_entry(&self, tenant_id: &ZTenantId) -> Option<&TenantEntry> {
+self.entries.get(tenant_id)
+}
+
+pub fn tenant_entry_mut(&mut self, tenant_id: &ZTenantId) -> Option<&mut TenantEntry> {
+self.entries.get_mut(tenant_id)
+}
+
+pub fn add_tenant_entry(&mut self, tenant_id: ZTenantId) -> &mut TenantEntry {
+self.entries.entry(tenant_id).or_default()
+}
+
+pub fn remove_tenant_entry(&mut self, tenant_id: &ZTenantId) -> Option<TenantEntry> {
+self.entries.remove(tenant_id)
 }

 pub fn set_awaits_download(
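Note: a simplified, self-contained sketch of the new index shape (tenant → timelines) and of the `has_in_progress_downloads` gate the sync loop uses before registering a tenant's timelines; plain std types stand in for the crate's `ZTenantId`, `ZTimelineId` and `RemoteTimeline`.

use std::collections::HashMap;

// Simplified stand-ins for the crate types; only the field the gate looks at is kept.
type TenantId = u128;
type TimelineId = u128;

#[derive(Default)]
struct RemoteTimelineStub {
    awaits_download: bool,
}

// Mirrors TenantEntry: one map of timelines per tenant.
#[derive(Default)]
struct TenantEntryStub(HashMap<TimelineId, RemoteTimelineStub>);

impl TenantEntryStub {
    fn has_in_progress_downloads(&self) -> bool {
        self.0.values().any(|t| t.awaits_download)
    }
}

fn main() {
    let mut index: HashMap<TenantId, TenantEntryStub> = HashMap::new();
    let tenant = 1;
    index
        .entry(tenant)
        .or_default()
        .0
        .insert(10, RemoteTimelineStub { awaits_download: true });
    index
        .entry(tenant)
        .or_default()
        .0
        .insert(11, RemoteTimelineStub { awaits_download: false });

    // The sync loop only registers a tenant's timelines once nothing is still downloading.
    let entry = &index[&tenant];
    if entry.has_in_progress_downloads() {
        println!("tenant {tenant} still has pending downloads, skip registration");
    } else {
        println!("tenant {tenant} fully downloaded, attach its timelines");
    }
}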
@@ -4,6 +4,7 @@ use std::{fmt::Debug, path::PathBuf};

 use anyhow::Context;
 use futures::stream::{FuturesUnordered, StreamExt};
+use lazy_static::lazy_static;
 use remote_storage::RemoteStorage;
 use tokio::fs;
 use tracing::{debug, error, info, warn};
@@ -17,6 +18,16 @@ use super::{
 use crate::{
 config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask,
 };
+use metrics::{register_int_counter_vec, IntCounterVec};
+
+lazy_static! {
+static ref NO_LAYERS_UPLOAD: IntCounterVec = register_int_counter_vec!(
+"pageserver_remote_storage_no_layers_uploads_total",
+"Number of skipped uploads due to no layers",
+&["tenant_id", "timeline_id"],
+)
+.expect("failed to register pageserver no layers upload vec");
+}

 /// Serializes and uploads the given index part data to the remote storage.
 pub(super) async fn upload_index_part<P, S>(
@@ -102,7 +113,13 @@ where
 .collect::<Vec<_>>();

 if layers_to_upload.is_empty() {
-info!("No layers to upload after filtering, aborting");
+debug!("No layers to upload after filtering, aborting");
+NO_LAYERS_UPLOAD
+.with_label_values(&[
+&sync_id.tenant_id.to_string(),
+&sync_id.timeline_id.to_string(),
+])
+.inc();
 return UploadedTimeline::Successful(upload_data);
 }

@@ -37,7 +37,7 @@ pub mod defaults {
 pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
 pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
 pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
-pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10_000;
+pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
 }

 /// Per-tenant configuration options
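Note: the new default for DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG works out to 10 * 1024 * 1024 = 10_485_760, i.e. roughly 10 MiB of WAL lag, whereas the previous flat 10_000 corresponded to only about 10 KB.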
@@ -4,8 +4,8 @@
 use crate::config::PageServerConf;
 use crate::layered_repository::{load_metadata, LayeredRepository};
 use crate::pgdatadir_mapping::DatadirTimeline;
-use crate::repository::{Repository, TimelineSyncStatusUpdate};
-use crate::storage_sync::index::RemoteIndex;
+use crate::repository::Repository;
+use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex};
 use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData};
 use crate::tenant_config::TenantConfOpt;
 use crate::thread_mgr::ThreadKind;
@@ -13,11 +13,11 @@ use crate::timelines::CreateRepo;
 use crate::walredo::PostgresRedoManager;
 use crate::{thread_mgr, timelines, walreceiver};
 use crate::{DatadirTimelineImpl, RepositoryImpl};
-use anyhow::{bail, Context};
+use anyhow::Context;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use std::collections::hash_map::Entry;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::fmt;
 use std::sync::Arc;
 use tokio::sync::mpsc;
@@ -157,7 +157,13 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIn
 // loading a tenant is serious, but it's better to complete the startup and
 // serve other tenants, than fail completely.
 error!("Failed to initialize local tenant {tenant_id}: {:?}", err);
-set_tenant_state(tenant_id, TenantState::Broken)?;
+if let Err(err) = set_tenant_state(tenant_id, TenantState::Broken) {
+error!(
+"Failed to set tenant state to broken {tenant_id}: {:?}",
+err
+);
+}
 }
 }

@@ -165,44 +171,51 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIn
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub enum LocalTimelineUpdate {
|
pub enum LocalTimelineUpdate {
|
||||||
Detach(ZTenantTimelineId),
|
Detach {
|
||||||
Attach(ZTenantTimelineId, Arc<DatadirTimelineImpl>),
|
id: ZTenantTimelineId,
|
||||||
|
// used to signal to the detach caller that walreceiver successfully terminated for specified id
|
||||||
|
join_confirmation_sender: std::sync::mpsc::Sender<()>,
|
||||||
|
},
|
||||||
|
Attach {
|
||||||
|
id: ZTenantTimelineId,
|
||||||
|
datadir: Arc<DatadirTimelineImpl>,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Debug for LocalTimelineUpdate {
|
impl std::fmt::Debug for LocalTimelineUpdate {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
Self::Detach(ttid) => f.debug_tuple("Remove").field(ttid).finish(),
|
Self::Detach { id, .. } => f.debug_tuple("Remove").field(id).finish(),
|
||||||
Self::Attach(ttid, _) => f.debug_tuple("Add").field(ttid).finish(),
|
Self::Attach { id, .. } => f.debug_tuple("Add").field(id).finish(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Updates tenants' repositories, changing their timelines state in memory.
|
/// Updates tenants' repositories, changing their timelines state in memory.
|
||||||
pub fn apply_timeline_sync_status_updates(
|
pub fn attach_downloaded_tenants(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
remote_index: &RemoteIndex,
|
remote_index: &RemoteIndex,
|
||||||
sync_status_updates: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncStatusUpdate>>,
|
sync_status_updates: HashMap<ZTenantId, HashSet<ZTimelineId>>,
|
||||||
) {
|
) {
|
||||||
if sync_status_updates.is_empty() {
|
if sync_status_updates.is_empty() {
|
||||||
debug!("no sync status updates to apply");
|
debug!("No sync status updates to apply");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
info!(
|
for (tenant_id, downloaded_timelines) in sync_status_updates {
|
||||||
"Applying sync status updates for {} timelines",
|
info!(
|
||||||
sync_status_updates.len()
|
"Registering downlloaded timelines for {tenant_id} {} timelines",
|
||||||
);
|
downloaded_timelines.len()
|
||||||
debug!("Sync status updates: {sync_status_updates:?}");
|
);
|
||||||
|
debug!("Downloaded timelines: {downloaded_timelines:?}");
|
||||||
|
|
||||||
for (tenant_id, status_updates) in sync_status_updates {
|
|
||||||
let repo = match load_local_repo(conf, tenant_id, remote_index) {
|
let repo = match load_local_repo(conf, tenant_id, remote_index) {
|
||||||
Ok(repo) => repo,
|
Ok(repo) => repo,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!("Failed to load repo for tenant {tenant_id} Error: {e:?}",);
|
error!("Failed to load repo for tenant {tenant_id} Error: {e:?}");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
match apply_timeline_remote_sync_status_updates(&repo, status_updates) {
|
match attach_downloaded_tenant(&repo, downloaded_timelines) {
|
||||||
Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"),
|
Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"),
|
||||||
Err(e) => error!(
|
Err(e) => error!(
|
||||||
"Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}"
|
"Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}"
|
||||||
@@ -230,8 +243,6 @@ pub fn shutdown_all_tenants() {
|
|||||||
drop(m);
|
drop(m);
|
||||||
|
|
||||||
thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None);
|
thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None);
|
||||||
thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None);
|
|
||||||
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), None, None);
|
|
||||||
|
|
||||||
// Ok, no background threads running anymore. Flush any remaining data in
|
// Ok, no background threads running anymore. Flush any remaining data in
|
||||||
// memory to disk.
|
// memory to disk.
|
||||||
@@ -330,44 +341,12 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow:
|
|||||||
}
|
}
|
||||||
(TenantState::Idle, TenantState::Active) => {
|
(TenantState::Idle, TenantState::Active) => {
|
||||||
info!("activating tenant {tenant_id}");
|
info!("activating tenant {tenant_id}");
|
||||||
let compactor_spawn_result = thread_mgr::spawn(
|
|
||||||
ThreadKind::Compactor,
|
|
||||||
Some(tenant_id),
|
|
||||||
None,
|
|
||||||
"Compactor thread",
|
|
||||||
false,
|
|
||||||
move || crate::tenant_threads::compact_loop(tenant_id),
|
|
||||||
);
|
|
||||||
if compactor_spawn_result.is_err() {
|
|
||||||
let mut m = tenants_state::write_tenants();
|
|
||||||
m.get_mut(&tenant_id)
|
|
||||||
.with_context(|| format!("Tenant not found for id {tenant_id}"))?
|
|
||||||
.state = old_state;
|
|
||||||
drop(m);
|
|
||||||
}
|
|
||||||
compactor_spawn_result?;
|
|
||||||
|
|
||||||
let gc_spawn_result = thread_mgr::spawn(
|
// Spawn gc and compaction loops. The loops will shut themselves
|
||||||
ThreadKind::GarbageCollector,
|
// down when they notice that the tenant is inactive.
|
||||||
Some(tenant_id),
|
// TODO maybe use tokio::sync::watch instead?
|
||||||
None,
|
crate::tenant_tasks::start_compaction_loop(tenant_id)?;
|
||||||
"GC thread",
|
crate::tenant_tasks::start_gc_loop(tenant_id)?;
|
||||||
false,
|
|
||||||
move || crate::tenant_threads::gc_loop(tenant_id),
|
|
||||||
)
|
|
||||||
.map(|_thread_id| ()) // update the `Result::Ok` type to match the outer function's return signature
|
|
||||||
.with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}"));
|
|
||||||
|
|
||||||
if let Err(e) = &gc_spawn_result {
|
|
||||||
let mut m = tenants_state::write_tenants();
|
|
||||||
m.get_mut(&tenant_id)
|
|
||||||
.with_context(|| format!("Tenant not found for id {tenant_id}"))?
|
|
||||||
.state = old_state;
|
|
||||||
drop(m);
|
|
||||||
error!("Failed to start GC thread for tenant {tenant_id}, stopping its checkpointer thread: {e:?}");
|
|
||||||
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
|
|
||||||
return gc_spawn_result;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
(TenantState::Idle, TenantState::Stopping) => {
|
(TenantState::Idle, TenantState::Stopping) => {
|
||||||
info!("stopping idle tenant {tenant_id}");
|
info!("stopping idle tenant {tenant_id}");
|
||||||
@@ -379,8 +358,10 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow:
|
|||||||
Some(tenant_id),
|
Some(tenant_id),
|
||||||
None,
|
None,
|
||||||
);
|
);
|
||||||
thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), Some(tenant_id), None);
|
|
||||||
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
|
// Wait until all gc/compaction tasks finish
|
||||||
|
let repo = get_repository_for_tenant(tenant_id)?;
|
||||||
|
let _guard = repo.file_lock.write().unwrap();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -419,33 +400,86 @@ pub fn get_local_timeline_with_load(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn detach_timeline(
|
pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> {
|
||||||
conf: &'static PageServerConf,
|
// Start with the shutdown of timeline tasks (this shuts down the walreceiver)
|
||||||
tenant_id: ZTenantId,
|
// It is important that we do not take locks here, and do not check whether the timeline exists
|
||||||
timeline_id: ZTimelineId,
|
// because if we hold tenants_state::write_tenants() while awaiting for the threads to join
|
||||||
) -> anyhow::Result<()> {
|
// we cannot create new timelines and tenants, and that can take quite some time,
|
||||||
// shutdown the timeline threads (this shuts down the walreceiver)
|
// it can even become stuck due to a bug making whole pageserver unavailable for some operations
|
||||||
thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id));
|
// so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation
|
||||||
|
// and then try to actually remove timeline from inmemory state and this is the point when concurrent requests
|
||||||
|
// will synchronize and either fail with the not found error or succeed
|
||||||
|
|
||||||
|
let (sender, receiver) = std::sync::mpsc::channel::<()>();
|
||||||
|
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach {
|
||||||
|
id: ZTenantTimelineId::new(tenant_id, timeline_id),
|
||||||
|
join_confirmation_sender: sender,
|
||||||
|
});
|
||||||
|
|
||||||
|
debug!("waiting for wal receiver to shutdown");
|
||||||
|
let _ = receiver.recv();
|
||||||
|
debug!("wal receiver shutdown confirmed");
|
||||||
|
debug!("waiting for threads to shutdown");
|
||||||
|
thread_mgr::shutdown_threads(None, None, Some(timeline_id));
|
||||||
|
debug!("thread shutdown completed");
|
||||||
match tenants_state::write_tenants().get_mut(&tenant_id) {
|
match tenants_state::write_tenants().get_mut(&tenant_id) {
|
||||||
Some(tenant) => {
|
Some(tenant) => {
|
||||||
tenant
|
tenant.repo.delete_timeline(timeline_id)?;
|
||||||
.repo
|
|
||||||
.detach_timeline(timeline_id)
|
|
||||||
.context("Failed to detach inmem tenant timeline")?;
|
|
||||||
tenant.local_timelines.remove(&timeline_id);
|
tenant.local_timelines.remove(&timeline_id);
|
||||||
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach(
|
|
||||||
ZTenantTimelineId::new(tenant_id, timeline_id),
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
None => bail!("Tenant {tenant_id} not found in local tenant state"),
|
None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"),
|
||||||
}
|
}
|
||||||
|
|
||||||
let local_timeline_directory = conf.timeline_path(&timeline_id, &tenant_id);
|
Ok(())
|
||||||
std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
|
}
|
||||||
|
|
||||||
|
pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> anyhow::Result<()> {
|
||||||
|
set_tenant_state(tenant_id, TenantState::Stopping)?;
|
||||||
|
// shutdown the tenant and timeline threads: gc, compaction, page service threads)
|
||||||
|
thread_mgr::shutdown_threads(None, Some(tenant_id), None);
|
||||||
|
|
||||||
|
// FIXME should we protect somehow from starting new threads/walreceivers when tenant is in stopping state?
|
||||||
|
// send stop signal to wal receiver and collect join handles while holding the lock
|
||||||
|
let walreceiver_join_handles = {
|
||||||
|
let tenants = tenants_state::write_tenants();
|
||||||
|
let tenant = tenants.get(&tenant_id).context("tenant not found")?;
|
||||||
|
let mut walreceiver_join_handles = Vec::with_capacity(tenant.local_timelines.len());
|
||||||
|
for timeline_id in tenant.local_timelines.keys() {
|
||||||
|
let (sender, receiver) = std::sync::mpsc::channel::<()>();
|
||||||
|
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach {
|
||||||
|
id: ZTenantTimelineId::new(tenant_id, *timeline_id),
|
||||||
|
join_confirmation_sender: sender,
|
||||||
|
});
|
||||||
|
walreceiver_join_handles.push((*timeline_id, receiver));
|
||||||
|
}
|
||||||
|
// drop the tenants lock
|
||||||
|
walreceiver_join_handles
|
||||||
|
};
|
||||||
|
|
||||||
|
// wait for wal receivers to stop without holding the lock, because walreceiver
|
||||||
|
// will attempt to change tenant state which is protected by the same global tenants lock.
|
||||||
|
// TODO do we need a timeout here? how to handle it?
|
||||||
|
// recv_timeout is broken: https://github.com/rust-lang/rust/issues/94518#issuecomment-1057440631
|
||||||
|
// need to use crossbeam-channel
|
||||||
|
for (timeline_id, join_handle) in walreceiver_join_handles {
|
||||||
|
info!("waiting for wal receiver to shutdown timeline_id {timeline_id}");
|
||||||
|
join_handle.recv().context("failed to join walreceiver")?;
|
||||||
|
info!("wal receiver shutdown confirmed timeline_id {timeline_id}");
|
||||||
|
}
|
||||||
|
|
||||||
|
tenants_state::write_tenants().remove(&tenant_id);
|
||||||
|
|
||||||
|
// If removal fails there will be no way to successfully retry detach,
|
||||||
|
// because tenant no longer exists in in memory map. And it needs to be removed from it
|
||||||
|
// before we remove files because it contains references to repository
|
||||||
|
// which references ephemeral files which are deleted on drop. So if we keep these references
|
||||||
|
// code will attempt to remove files which no longer exist. This can be fixed by having shutdown
|
||||||
|
// mechanism for repository that will clean temporary data to avoid any references to ephemeral files
|
||||||
|
let local_tenant_directory = conf.tenant_path(&tenant_id);
|
||||||
|
std::fs::remove_dir_all(&local_tenant_directory).with_context(|| {
|
||||||
format!(
|
format!(
|
||||||
"Failed to remove local timeline directory '{}'",
|
"Failed to remove local timeline directory '{}'",
|
||||||
local_timeline_directory.display()
|
local_tenant_directory.display()
|
||||||
)
|
)
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
@@ -466,10 +500,10 @@ fn load_local_timeline(
|
|||||||
));
|
));
|
||||||
page_tline.init_logical_size()?;
|
page_tline.init_logical_size()?;
|
||||||
|
|
||||||
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach(
|
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach {
|
||||||
ZTenantTimelineId::new(repo.tenant_id(), timeline_id),
|
id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id),
|
||||||
Arc::clone(&page_tline),
|
datadir: Arc::clone(&page_tline),
|
||||||
));
|
});
|
||||||
|
|
||||||
Ok(page_tline)
|
Ok(page_tline)
|
||||||
}
|
}
|
||||||
@@ -479,15 +513,27 @@ fn load_local_timeline(
|
|||||||
pub struct TenantInfo {
|
pub struct TenantInfo {
|
||||||
#[serde_as(as = "DisplayFromStr")]
|
#[serde_as(as = "DisplayFromStr")]
|
||||||
pub id: ZTenantId,
|
pub id: ZTenantId,
|
||||||
pub state: TenantState,
|
pub state: Option<TenantState>,
|
||||||
|
pub has_in_progress_downloads: Option<bool>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn list_tenants() -> Vec<TenantInfo> {
|
pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec<TenantInfo> {
|
||||||
tenants_state::read_tenants()
|
tenants_state::read_tenants()
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(id, tenant)| TenantInfo {
|
.map(|(id, tenant)| {
|
||||||
id: *id,
|
let has_in_progress_downloads = remote_index
|
||||||
state: tenant.state,
|
.tenant_entry(id)
|
||||||
|
.map(|entry| entry.has_in_progress_downloads());
|
||||||
|
|
||||||
|
if has_in_progress_downloads.is_none() {
|
||||||
|
error!("timeline is not found in remote index while it is present in the tenants registry")
|
||||||
|
}
|
||||||
|
|
||||||
|
TenantInfo {
|
||||||
|
id: *id,
|
||||||
|
state: Some(tenant.state),
|
||||||
|
has_in_progress_downloads,
|
||||||
|
}
|
||||||
})
|
})
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
@@ -499,74 +545,73 @@ pub fn list_tenants() -> Vec<TenantInfo> {
|
|||||||
/// A timeline is categorized as broken when any of following conditions is true:
|
/// A timeline is categorized as broken when any of following conditions is true:
|
||||||
/// - failed to load the timeline's metadata
|
/// - failed to load the timeline's metadata
|
||||||
/// - the timeline's disk consistent LSN is zero
|
/// - the timeline's disk consistent LSN is zero
|
||||||
fn check_broken_timeline(repo: &LayeredRepository, timeline_id: ZTimelineId) -> anyhow::Result<()> {
|
fn check_broken_timeline(
|
||||||
let metadata = load_metadata(repo.conf, timeline_id, repo.tenant_id())
|
conf: &'static PageServerConf,
|
||||||
.context("failed to load metadata")?;
|
tenant_id: ZTenantId,
|
||||||
|
timeline_id: ZTimelineId,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let metadata =
|
||||||
|
load_metadata(conf, timeline_id, tenant_id).context("failed to load metadata")?;
|
||||||
|
|
||||||
// A timeline with zero disk consistent LSN can happen when the page server
|
// A timeline with zero disk consistent LSN can happen when the page server
|
||||||
// failed to checkpoint the timeline import data when creating that timeline.
|
// failed to checkpoint the timeline import data when creating that timeline.
|
||||||
if metadata.disk_consistent_lsn() == Lsn::INVALID {
|
if metadata.disk_consistent_lsn() == Lsn::INVALID {
|
||||||
bail!("Timeline {timeline_id} has a zero disk consistent LSN.");
|
anyhow::bail!("Timeline {timeline_id} has a zero disk consistent LSN.");
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Note: all timelines are attached at once if and only if all of them are locally complete
|
||||||
fn init_local_repository(
|
fn init_local_repository(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_id: ZTenantId,
|
tenant_id: ZTenantId,
|
||||||
local_timeline_init_statuses: HashMap<ZTimelineId, LocalTimelineInitStatus>,
|
local_timeline_init_statuses: HashMap<ZTimelineId, LocalTimelineInitStatus>,
|
||||||
remote_index: &RemoteIndex,
|
remote_index: &RemoteIndex,
|
||||||
) -> anyhow::Result<(), anyhow::Error> {
|
) -> anyhow::Result<(), anyhow::Error> {
|
||||||
// initialize local tenant
|
let mut timelines_to_attach = HashSet::new();
|
||||||
let repo = load_local_repo(conf, tenant_id, remote_index)
|
|
||||||
.with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?;
|
|
||||||
|
|
||||||
let mut status_updates = HashMap::with_capacity(local_timeline_init_statuses.len());
|
|
||||||
for (timeline_id, init_status) in local_timeline_init_statuses {
|
for (timeline_id, init_status) in local_timeline_init_statuses {
|
||||||
match init_status {
|
match init_status {
|
||||||
LocalTimelineInitStatus::LocallyComplete => {
|
LocalTimelineInitStatus::LocallyComplete => {
|
||||||
debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository");
|
debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository");
|
||||||
if let Err(err) = check_broken_timeline(&repo, timeline_id) {
|
check_broken_timeline(conf, tenant_id, timeline_id)
|
||||||
info!(
|
.context("found broken timeline")?;
|
||||||
"Found a broken timeline {timeline_id} (err={err:?}), skip registering it in repository"
|
timelines_to_attach.insert(timeline_id);
|
||||||
);
|
|
||||||
} else {
|
|
||||||
status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
LocalTimelineInitStatus::NeedsSync => {
|
LocalTimelineInitStatus::NeedsSync => {
|
||||||
debug!(
|
debug!(
|
||||||
"timeline {tenant_id} for tenant {timeline_id} needs sync, \
|
"timeline {tenant_id} for tenant {timeline_id} needs sync, \
|
||||||
so skipped for adding into repository until sync is finished"
|
so skipped for adding into repository until sync is finished"
|
||||||
);
|
);
|
||||||
|
return Ok(());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// initialize local tenant
|
||||||
|
let repo = load_local_repo(conf, tenant_id, remote_index)
|
||||||
|
.with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?;
|
||||||
|
|
||||||
// Lets fail here loudly to be on the safe side.
|
// Lets fail here loudly to be on the safe side.
|
||||||
// XXX: It may be a better api to actually distinguish between repository startup
|
// XXX: It may be a better api to actually distinguish between repository startup
|
||||||
// and processing of newly downloaded timelines.
|
// and processing of newly downloaded timelines.
|
||||||
apply_timeline_remote_sync_status_updates(&repo, status_updates)
|
attach_downloaded_tenant(&repo, timelines_to_attach)
|
||||||
.with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))?;
|
.with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn apply_timeline_remote_sync_status_updates(
|
fn attach_downloaded_tenant(
|
||||||
repo: &LayeredRepository,
|
repo: &LayeredRepository,
|
||||||
status_updates: HashMap<ZTimelineId, TimelineSyncStatusUpdate>,
|
downloaded_timelines: HashSet<ZTimelineId>,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let mut registration_queue = Vec::with_capacity(status_updates.len());
|
let mut registration_queue = Vec::with_capacity(downloaded_timelines.len());
|
||||||
|
|
||||||
// first need to register the in-mem representations, to avoid missing ancestors during the local disk data registration
|
// first need to register the in-mem representations, to avoid missing ancestors during the local disk data registration
|
||||||
for (timeline_id, status_update) in status_updates {
|
for timeline_id in downloaded_timelines {
|
||||||
repo.apply_timeline_remote_sync_status_update(timeline_id, status_update)
|
repo.attach_timeline(timeline_id).with_context(|| {
|
||||||
.with_context(|| {
|
format!("Failed to load timeline {timeline_id} into in-memory repository")
|
||||||
format!("Failed to load timeline {timeline_id} into in-memory repository")
|
})?;
|
||||||
})?;
|
registration_queue.push(timeline_id);
|
||||||
match status_update {
|
|
||||||
TimelineSyncStatusUpdate::Downloaded => registration_queue.push(timeline_id),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for timeline_id in registration_queue {
|
for timeline_id in registration_queue {
|
||||||
@@ -574,7 +619,7 @@ fn apply_timeline_remote_sync_status_updates(
|
|||||||
match tenants_state::write_tenants().get_mut(&tenant_id) {
|
match tenants_state::write_tenants().get_mut(&tenant_id) {
|
||||||
Some(tenant) => match tenant.local_timelines.entry(timeline_id) {
|
Some(tenant) => match tenant.local_timelines.entry(timeline_id) {
|
||||||
Entry::Occupied(_) => {
|
Entry::Occupied(_) => {
|
||||||
bail!("Local timeline {timeline_id} already registered")
|
anyhow::bail!("Local timeline {timeline_id} already registered")
|
||||||
}
|
}
|
||||||
Entry::Vacant(v) => {
|
Entry::Vacant(v) => {
|
||||||
v.insert(load_local_timeline(repo, timeline_id).with_context(|| {
|
v.insert(load_local_timeline(repo, timeline_id).with_context(|| {
|
||||||
@@ -582,7 +627,7 @@ fn apply_timeline_remote_sync_status_updates(
|
|||||||
})?);
|
})?);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
None => bail!(
|
None => anyhow::bail!(
|
||||||
"Tenant {} not found in local tenant state",
|
"Tenant {} not found in local tenant state",
|
||||||
repo.tenant_id()
|
repo.tenant_id()
|
||||||
),
|
),
|
||||||
|
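For illustration only, here is a minimal, self-contained sketch of the shutdown handshake that the new LocalTimelineUpdate::Detach variant and delete_timeline rely on: the caller passes a std::sync::mpsc sender as join_confirmation_sender and then blocks on the matching receiver until the background side signals (or simply drops the sender). The Update enum and the u64 id below are simplified stand-ins, not pageserver types.

// Sketch of the detach confirmation handshake (simplified, standard library only).
use std::sync::mpsc;
use std::thread;
use std::time::Duration;

enum Update {
    Detach { id: u64, join_confirmation_sender: mpsc::Sender<()> },
}

fn main() {
    let (update_tx, update_rx) = mpsc::channel::<Update>();

    // Stand-in for the walreceiver main loop that receives timeline updates.
    let background = thread::spawn(move || {
        while let Ok(Update::Detach { id, join_confirmation_sender }) = update_rx.recv() {
            println!("stopping walreceiver for timeline {id}");
            thread::sleep(Duration::from_millis(50)); // pretend to wind down
            // Sending (or dropping) the sender wakes the waiting caller.
            join_confirmation_sender.send(()).ok();
        }
    });

    // Stand-in for delete_timeline: request detach, then wait for confirmation.
    let (confirm_tx, confirm_rx) = mpsc::channel::<()>();
    update_tx
        .send(Update::Detach { id: 1, join_confirmation_sender: confirm_tx })
        .unwrap();
    // recv() returns Ok on an explicit send and Err if the sender was dropped;
    // either way the background side is done with this timeline.
    let _ = confirm_rx.recv();
    println!("walreceiver shutdown confirmed");

    drop(update_tx);
    background.join().unwrap();
}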
pageserver/src/tenant_tasks.rs (new file, 286 lines)
@@ -0,0 +1,286 @@
+//! This module contains functions to serve per-tenant background processes,
+//! such as compaction and GC
+
+use std::collections::HashMap;
+use std::ops::ControlFlow;
+use std::time::Duration;
+
+use crate::repository::Repository;
+use crate::tenant_mgr::TenantState;
+use crate::thread_mgr::ThreadKind;
+use crate::{tenant_mgr, thread_mgr};
+use anyhow::{self, Context};
+use futures::stream::FuturesUnordered;
+use futures::StreamExt;
+use metrics::{register_int_counter_vec, IntCounterVec};
+use once_cell::sync::{Lazy, OnceCell};
+use tokio::sync::mpsc;
+use tokio::sync::watch;
+use tracing::*;
+use utils::zid::ZTenantId;
+
+static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_tenant_task_events",
+        "Number of task start/stop/fail events.",
+        &["event"],
+    )
+    .expect("Failed to register tenant_task_events metric")
+});
+
+///
+/// Compaction task's main loop
+///
+async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) {
+    loop {
+        trace!("waking up");
+
+        // Run blocking part of the task
+        let period: Result<Result<_, anyhow::Error>, _> = tokio::task::spawn_blocking(move || {
+            // Break if tenant is not active
+            if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
+                return Ok(ControlFlow::Break(()));
+            }
+
+            // Break if we're not allowed to write to disk
+            let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
+            // TODO do this inside repo.compaction_iteration instead.
+            let _guard = match repo.file_lock.try_read() {
+                Ok(g) => g,
+                Err(_) => return Ok(ControlFlow::Break(())),
+            };
+
+            // Run compaction
+            let compaction_period = repo.get_compaction_period();
+            repo.compaction_iteration()?;
+            Ok(ControlFlow::Continue(compaction_period))
+        })
+        .await;
+
+        // Decide whether to sleep or break
+        let sleep_duration = match period {
+            Ok(Ok(ControlFlow::Continue(period))) => period,
+            Ok(Ok(ControlFlow::Break(()))) => break,
+            Ok(Err(e)) => {
+                error!("Compaction failed, retrying: {}", e);
+                Duration::from_secs(2)
+            }
+            Err(e) => {
+                error!("Compaction join error, retrying: {}", e);
+                Duration::from_secs(2)
+            }
+        };
+
+        // Sleep
+        tokio::select! {
+            _ = cancel.changed() => {
+                trace!("received cancellation request");
+                break;
+            },
+            _ = tokio::time::sleep(sleep_duration) => {},
+        }
+    }
+
+    trace!(
+        "compaction loop stopped. State is {:?}",
+        tenant_mgr::get_tenant_state(tenantid)
+    );
+}
+
+static START_GC_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();
+static START_COMPACTION_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();
+
+/// Spawn a task that will periodically schedule garbage collection until
+/// the tenant becomes inactive. This should be called on tenant
+/// activation.
+pub fn start_gc_loop(tenantid: ZTenantId) -> anyhow::Result<()> {
+    START_GC_LOOP
+        .get()
+        .context("Failed to get START_GC_LOOP")?
+        .blocking_send(tenantid)
+        .context("Failed to send to START_GC_LOOP channel")?;
+    Ok(())
+}
+
+/// Spawn a task that will periodically schedule compaction until
+/// the tenant becomes inactive. This should be called on tenant
+/// activation.
+pub fn start_compaction_loop(tenantid: ZTenantId) -> anyhow::Result<()> {
+    START_COMPACTION_LOOP
+        .get()
+        .context("failed to get START_COMPACTION_LOOP")?
+        .blocking_send(tenantid)
+        .context("failed to send to START_COMPACTION_LOOP")?;
+    Ok(())
+}
+
+/// Spawn the TenantTaskManager
+/// This needs to be called before start_gc_loop or start_compaction_loop
+pub fn init_tenant_task_pool() -> anyhow::Result<()> {
+    let runtime = tokio::runtime::Builder::new_multi_thread()
+        .thread_name("tenant-task-worker")
+        .enable_all()
+        .build()?;
+
+    let (gc_send, mut gc_recv) = mpsc::channel::<ZTenantId>(100);
+    START_GC_LOOP
+        .set(gc_send)
+        .expect("Failed to set START_GC_LOOP");
+
+    let (compaction_send, mut compaction_recv) = mpsc::channel::<ZTenantId>(100);
+    START_COMPACTION_LOOP
+        .set(compaction_send)
+        .expect("Failed to set START_COMPACTION_LOOP");
+
+    // TODO this is getting repetitive
+    let mut gc_loops = HashMap::<ZTenantId, watch::Sender<()>>::new();
+    let mut compaction_loops = HashMap::<ZTenantId, watch::Sender<()>>::new();
+
+    thread_mgr::spawn(
+        ThreadKind::TenantTaskManager,
+        None,
+        None,
+        "Tenant task manager main thread",
+        true,
+        move || {
+            runtime.block_on(async move {
+                let mut futures = FuturesUnordered::new();
+                loop {
+                    tokio::select! {
+                        _ = thread_mgr::shutdown_watcher() => {
+                            // Send cancellation to all tasks
+                            for (_, cancel) in gc_loops.drain() {
+                                cancel.send(()).ok();
+                            }
+                            for (_, cancel) in compaction_loops.drain() {
+                                cancel.send(()).ok();
+                            }
+
+                            // Exit after all tasks finish
+                            while let Some(result) = futures.next().await {
+                                match result {
+                                    Ok(()) => {
+                                        TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
+                                    },
+                                    Err(e) => {
+                                        TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc();
+                                        error!("loop join error {}", e)
+                                    },
+                                }
+                            }
+                            break;
+                        },
+                        tenantid = gc_recv.recv() => {
+                            let tenantid = tenantid.expect("Gc task channel closed unexpectedly");
+
+                            // Spawn new task, request cancellation of the old one if exists
+                            let (cancel_send, cancel_recv) = watch::channel(());
+                            let handle = tokio::spawn(gc_loop(tenantid, cancel_recv)
+                                .instrument(info_span!("gc loop", tenant = %tenantid)));
+                            if let Some(old_cancel_send) = gc_loops.insert(tenantid, cancel_send) {
+                                old_cancel_send.send(()).ok();
+                            }
+
+                            // Update metrics, remember handle
+                            TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
+                            futures.push(handle);
+                        },
+                        tenantid = compaction_recv.recv() => {
+                            let tenantid = tenantid.expect("Compaction task channel closed unexpectedly");
+
+                            // Spawn new task, request cancellation of the old one if exists
+                            let (cancel_send, cancel_recv) = watch::channel(());
+                            let handle = tokio::spawn(compaction_loop(tenantid, cancel_recv)
+                                .instrument(info_span!("compaction loop", tenant = %tenantid)));
+                            if let Some(old_cancel_send) = compaction_loops.insert(tenantid, cancel_send) {
+                                old_cancel_send.send(()).ok();
+                            }
+
+                            // Update metrics, remember handle
+                            TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
+                            futures.push(handle);
+                        },
+                        result = futures.next() => {
+                            // Log and count any unhandled panics
+                            match result {
+                                Some(Ok(())) => {
+                                    TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
+                                },
+                                Some(Err(e)) => {
+                                    TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc();
+                                    error!("loop join error {}", e)
+                                },
+                                None => {},
+                            };
+                        },
+                    }
+                }
+            });
+            Ok(())
+        },
+    )?;
+
+    Ok(())
+}
+
+///
+/// GC task's main loop
+///
+async fn gc_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) {
+    loop {
+        trace!("waking up");
+
+        // Run blocking part of the task
+        let period: Result<Result<_, anyhow::Error>, _> = tokio::task::spawn_blocking(move || {
+            // Break if tenant is not active
+            if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
+                return Ok(ControlFlow::Break(()));
+            }
+
+            // Break if we're not allowed to write to disk
+            let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
+            // TODO do this inside repo.gc_iteration instead.
+            let _guard = match repo.file_lock.try_read() {
+                Ok(g) => g,
+                Err(_) => return Ok(ControlFlow::Break(())),
+            };
+
+            // Run gc
+            let gc_period = repo.get_gc_period();
+            let gc_horizon = repo.get_gc_horizon();
+            if gc_horizon > 0 {
+                repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?;
+            }
+
+            Ok(ControlFlow::Continue(gc_period))
+        })
+        .await;
+
+        // Decide whether to sleep or break
+        let sleep_duration = match period {
+            Ok(Ok(ControlFlow::Continue(period))) => period,
+            Ok(Ok(ControlFlow::Break(()))) => break,
+            Ok(Err(e)) => {
+                error!("Gc failed, retrying: {}", e);
+                Duration::from_secs(2)
+            }
+            Err(e) => {
+                error!("Gc join error, retrying: {}", e);
+                Duration::from_secs(2)
+            }
+        };
+
+        // Sleep
+        tokio::select! {
+            _ = cancel.changed() => {
+                trace!("received cancellation request");
+                break;
+            },
+            _ = tokio::time::sleep(sleep_duration) => {},
+        }
+    }
+    trace!(
+        "GC loop stopped. State is {:?}",
+        tenant_mgr::get_tenant_state(tenantid)
+    );
+}
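As a rough, standalone illustration of the scheduling pattern tenant_tasks.rs introduces above (a global OnceCell holding an mpsc sender, filled by an init function that owns a Tokio runtime on a plain thread, with callers handing work over via blocking_send), the sketch below compiles on its own with tokio, once_cell and anyhow as assumed dependencies; the tenant id is simplified to a u64 and the spawned "loop" is just a print.

// Sketch of the OnceCell + mpsc hand-off, not the pageserver's actual code.
use anyhow::Context;
use once_cell::sync::OnceCell;
use tokio::sync::mpsc;

static START_LOOP: OnceCell<mpsc::Sender<u64>> = OnceCell::new();

fn init_task_pool() -> anyhow::Result<()> {
    let runtime = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()?;
    let (send, mut recv) = mpsc::channel::<u64>(100);
    if START_LOOP.set(send).is_err() {
        anyhow::bail!("init_task_pool called twice");
    }
    // A plain OS thread drives the runtime; requests arrive over the channel.
    std::thread::spawn(move || {
        runtime.block_on(async move {
            while let Some(id) = recv.recv().await {
                tokio::spawn(async move {
                    println!("background loop started for tenant {id}");
                });
            }
        });
    });
    Ok(())
}

fn start_loop(id: u64) -> anyhow::Result<()> {
    START_LOOP
        .get()
        .context("init_task_pool must run first")?
        .blocking_send(id)
        .context("task manager is gone")?;
    Ok(())
}

fn main() -> anyhow::Result<()> {
    init_task_pool()?; // must run before any start_loop call
    start_loop(42)?;
    std::thread::sleep(std::time::Duration::from_millis(100));
    Ok(())
}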
@@ -1,79 +0,0 @@
-//! This module contains functions to serve per-tenant background processes,
-//! such as compaction and GC
-use crate::repository::Repository;
-use crate::tenant_mgr;
-use crate::tenant_mgr::TenantState;
-use anyhow::Result;
-use std::time::Duration;
-use tracing::*;
-use utils::zid::ZTenantId;
-
-///
-/// Compaction thread's main loop
-///
-pub fn compact_loop(tenantid: ZTenantId) -> Result<()> {
-    if let Err(err) = compact_loop_ext(tenantid) {
-        error!("compact loop terminated with error: {:?}", err);
-        Err(err)
-    } else {
-        Ok(())
-    }
-}
-
-fn compact_loop_ext(tenantid: ZTenantId) -> Result<()> {
-    loop {
-        if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
-            break;
-        }
-        let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
-        let compaction_period = repo.get_compaction_period();
-
-        std::thread::sleep(compaction_period);
-        trace!("compaction thread for tenant {} waking up", tenantid);
-
-        // Compact timelines
-        let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
-        repo.compaction_iteration()?;
-    }
-
-    trace!(
-        "compaction thread stopped for tenant {} state is {:?}",
-        tenantid,
-        tenant_mgr::get_tenant_state(tenantid)
-    );
-    Ok(())
-}
-
-///
-/// GC thread's main loop
-///
-pub fn gc_loop(tenantid: ZTenantId) -> Result<()> {
-    loop {
-        if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
-            break;
-        }
-
-        trace!("gc thread for tenant {} waking up", tenantid);
-        let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
-        let gc_horizon = repo.get_gc_horizon();
-        // Garbage collect old files that are not needed for PITR anymore
-        if gc_horizon > 0 {
-            repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?;
-        }
-
-        // TODO Write it in more adequate way using
-        // condvar.wait_timeout() or something
-        let mut sleep_time = repo.get_gc_period().as_secs();
-        while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == Some(TenantState::Active)
-        {
-            sleep_time -= 1;
-            std::thread::sleep(Duration::from_secs(1));
-        }
-    }
-    trace!(
-        "GC thread stopped for tenant {} state is {:?}",
-        tenantid,
-        tenant_mgr::get_tenant_state(tenantid)
-    );
-    Ok(())
-}
@@ -94,11 +94,8 @@ pub enum ThreadKind {
     // Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL.
     WalReceiverManager,

-    // Thread that handles compaction of all timelines for a tenant.
-    Compactor,
+    // Thread that schedules new compaction and gc jobs
+    TenantTaskManager,

-    // Thread that handles GC of a tenant
-    GarbageCollector,
-
     // Thread that flushes frozen in-memory layers to disk
     LayerFlushThread,
@@ -108,15 +105,21 @@ pub enum ThreadKind {
     StorageSync,
 }

+struct MutableThreadState {
+    /// Tenant and timeline that this thread is associated with.
+    tenant_id: Option<ZTenantId>,
+    timeline_id: Option<ZTimelineId>,
+
+    /// Handle for waiting for the thread to exit. It can be None, if the
+    /// the thread has already exited.
+    join_handle: Option<JoinHandle<()>>,
+}
+
 struct PageServerThread {
     _thread_id: u64,

     kind: ThreadKind,

-    /// Tenant and timeline that this thread is associated with.
-    tenant_id: Option<ZTenantId>,
-    timeline_id: Option<ZTimelineId>,
-
     name: String,

     // To request thread shutdown, set the flag, and send a dummy message to the
@@ -124,9 +127,7 @@ struct PageServerThread {
     shutdown_requested: AtomicBool,
     shutdown_tx: watch::Sender<()>,

-    /// Handle for waiting for the thread to exit. It can be None, if the
-    /// the thread has already exited.
-    join_handle: Mutex<Option<JoinHandle<()>>>,
+    mutable: Mutex<MutableThreadState>,
 }

 /// Launch a new thread
@@ -145,29 +146,27 @@ where
 {
     let (shutdown_tx, shutdown_rx) = watch::channel(());
     let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed);
-    let thread = PageServerThread {
+    let thread = Arc::new(PageServerThread {
         _thread_id: thread_id,
         kind,
-        tenant_id,
-        timeline_id,
         name: name.to_string(),

         shutdown_requested: AtomicBool::new(false),
         shutdown_tx,
-        join_handle: Mutex::new(None),
-    };
-
-    let thread_rc = Arc::new(thread);
-    let mut jh_guard = thread_rc.join_handle.lock().unwrap();
+        mutable: Mutex::new(MutableThreadState {
+            tenant_id,
+            timeline_id,
+            join_handle: None,
+        }),
+    });

     THREADS
         .lock()
         .unwrap()
-        .insert(thread_id, Arc::clone(&thread_rc));
+        .insert(thread_id, Arc::clone(&thread));

-    let thread_rc2 = Arc::clone(&thread_rc);
+    let mut thread_mut = thread.mutable.lock().unwrap();
+
+    let thread_cloned = Arc::clone(&thread);
     let thread_name = name.to_string();
     let join_handle = match thread::Builder::new()
         .name(name.to_string())
@@ -175,7 +174,7 @@ where
             thread_wrapper(
                 thread_name,
                 thread_id,
-                thread_rc2,
+                thread_cloned,
                 shutdown_rx,
                 shutdown_process_on_error,
                 f,
@@ -189,8 +188,8 @@ where
             return Err(err);
         }
     };
-    *jh_guard = Some(join_handle);
-    drop(jh_guard);
+    thread_mut.join_handle = Some(join_handle);
+    drop(thread_mut);

     // The thread is now running. Nothing more to do here
     Ok(thread_id)
@@ -229,19 +228,20 @@ fn thread_wrapper<F>(
         .remove(&thread_id)
         .expect("no thread in registry");

+    let thread_mut = thread.mutable.lock().unwrap();
     match result {
         Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name),
         Ok(Err(err)) => {
             if shutdown_process_on_error {
                 error!(
                     "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                    thread_name, thread.tenant_id, thread.timeline_id, err
+                    thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err
                 );
                 shutdown_pageserver(1);
             } else {
                 error!(
                     "Thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                    thread_name, thread.tenant_id, thread.timeline_id, err
+                    thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err
                 );
             }
         }
@@ -249,19 +249,29 @@ fn thread_wrapper<F>(
             if shutdown_process_on_error {
                 error!(
                     "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                    thread_name, thread.tenant_id, thread.timeline_id, err
+                    thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err
                 );
                 shutdown_pageserver(1);
             } else {
                 error!(
                     "Thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                    thread_name, thread.tenant_id, thread.timeline_id, err
+                    thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err
                 );
             }
         }
     }
 }

+// expected to be called from the thread of the given id.
+pub fn associate_with(tenant_id: Option<ZTenantId>, timeline_id: Option<ZTimelineId>) {
+    CURRENT_THREAD.with(|ct| {
+        let borrowed = ct.borrow();
+        let mut thread_mut = borrowed.as_ref().unwrap().mutable.lock().unwrap();
+        thread_mut.tenant_id = tenant_id;
+        thread_mut.timeline_id = timeline_id;
+    });
+}
+
 /// Is there a thread running that matches the criteria

 /// Signal and wait for threads to shut down.
@@ -285,9 +295,10 @@ pub fn shutdown_threads(

     let threads = THREADS.lock().unwrap();
     for thread in threads.values() {
+        let thread_mut = thread.mutable.lock().unwrap();
         if (kind.is_none() || Some(thread.kind) == kind)
-            && (tenant_id.is_none() || thread.tenant_id == tenant_id)
-            && (timeline_id.is_none() || thread.timeline_id == timeline_id)
+            && (tenant_id.is_none() || thread_mut.tenant_id == tenant_id)
+            && (timeline_id.is_none() || thread_mut.timeline_id == timeline_id)
         {
             thread.shutdown_requested.store(true, Ordering::Relaxed);
             // FIXME: handle error?
@@ -298,8 +309,10 @@ pub fn shutdown_threads(
     drop(threads);

     for thread in victim_threads {
+        let mut thread_mut = thread.mutable.lock().unwrap();
         info!("waiting for {} to shut down", thread.name);
-        if let Some(join_handle) = thread.join_handle.lock().unwrap().take() {
+        if let Some(join_handle) = thread_mut.join_handle.take() {
+            drop(thread_mut);
             let _ = join_handle.join();
         } else {
             // The thread had not even fully started yet. Or it was shut down
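A minimal sketch of the refactor pattern applied to PageServerThread above, assuming nothing beyond the standard library: the fields that can change after registration (tenant/timeline association and the join handle) move behind a single Mutex, while the Arc-shared registry entry itself stays immutable; the names below are simplified stand-ins, not the pageserver's thread_mgr types.

// Sketch only: immutable registry entry + one Mutex for the mutable parts.
use std::sync::{Arc, Mutex};
use std::thread::JoinHandle;

struct MutableState {
    tenant_id: Option<u64>,
    timeline_id: Option<u64>,
    join_handle: Option<JoinHandle<()>>,
}

struct RegisteredThread {
    name: String,
    mutable: Mutex<MutableState>,
}

fn main() {
    let entry = Arc::new(RegisteredThread {
        name: "worker".to_string(),
        mutable: Mutex::new(MutableState {
            tenant_id: None,
            timeline_id: None,
            join_handle: None,
        }),
    });

    // Later, the running thread associates itself with a tenant/timeline.
    {
        let mut m = entry.mutable.lock().unwrap();
        m.tenant_id = Some(1);
        m.timeline_id = Some(2);
    }

    // Shutdown path: take the join handle out, drop the guard, then join.
    let handle = entry.mutable.lock().unwrap().join_handle.take();
    if let Some(h) = handle {
        let _ = h.join();
    }
    println!(
        "{} associated with tenant {:?}",
        entry.name,
        entry.mutable.lock().unwrap().tenant_id
    );
}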
@@ -202,7 +202,7 @@ pub fn create_repo(
             // anymore, but I think that could still happen.
             let wal_redo_manager = Arc::new(crate::walredo::DummyRedoManager {});

-            (wal_redo_manager as _, RemoteIndex::empty())
+            (wal_redo_manager as _, RemoteIndex::default())
         }
     };

@@ -347,7 +347,7 @@ pub(crate) fn create_timeline(
     tenant_id: ZTenantId,
     new_timeline_id: Option<ZTimelineId>,
     ancestor_timeline_id: Option<ZTimelineId>,
-    ancestor_start_lsn: Option<Lsn>,
+    mut ancestor_start_lsn: Option<Lsn>,
 ) -> Result<Option<TimelineInfo>> {
     let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate);
     let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
@@ -357,41 +357,35 @@ pub(crate) fn create_timeline(
         return Ok(None);
     }

-    let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0));

     let new_timeline_info = match ancestor_timeline_id {
         Some(ancestor_timeline_id) => {
             let ancestor_timeline = repo
                 .get_timeline_load(ancestor_timeline_id)
                 .context("Cannot branch off the timeline that's not present locally")?;

-            if start_lsn == Lsn(0) {
-                // Find end of WAL on the old timeline
-                let end_of_wal = ancestor_timeline.get_last_record_lsn();
-                info!("branching at end of WAL: {}", end_of_wal);
-                start_lsn = end_of_wal;
-            } else {
+            if let Some(lsn) = ancestor_start_lsn.as_mut() {
                 // Wait for the WAL to arrive and be processed on the parent branch up
                 // to the requested branch point. The repository code itself doesn't
                 // require it, but if we start to receive WAL on the new timeline,
                 // decoding the new WAL might need to look up previous pages, relation
                 // sizes etc. and that would get confused if the previous page versions
                 // are not in the repository yet.
-                ancestor_timeline.wait_lsn(start_lsn)?;
-            }
-            start_lsn = start_lsn.align();
+                *lsn = lsn.align();
+                ancestor_timeline.wait_lsn(*lsn)?;

                 let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
-            if ancestor_ancestor_lsn > start_lsn {
+                if ancestor_ancestor_lsn > *lsn {
                     // can we safely just branch from the ancestor instead?
                     anyhow::bail!(
                         "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
-                        start_lsn,
+                        lsn,
                         ancestor_timeline_id,
                         ancestor_ancestor_lsn,
                     );
+                }
             }
-            repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?;
+
+            repo.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?;
             // load the timeline into memory
             let loaded_timeline =
                 tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?;
(File diff suppressed because it is too large.)

pageserver/src/walreceiver/connection_manager.rs (new file, 1221 lines)
(File diff suppressed because it is too large.)
@@ -1,5 +1,5 @@
|
|||||||
//! Actual Postgres connection handler to stream WAL to the server.
|
//! Actual Postgres connection handler to stream WAL to the server.
|
||||||
//! Runs as a separate, cancellable Tokio task.
|
|
||||||
use std::{
|
use std::{
|
||||||
str::FromStr,
|
str::FromStr,
|
||||||
sync::Arc,
|
sync::Arc,
|
||||||
@@ -10,113 +10,29 @@ use anyhow::{bail, ensure, Context};
|
|||||||
use bytes::BytesMut;
|
use bytes::BytesMut;
|
||||||
use fail::fail_point;
|
use fail::fail_point;
|
||||||
use postgres::{SimpleQueryMessage, SimpleQueryRow};
|
use postgres::{SimpleQueryMessage, SimpleQueryRow};
|
||||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
|
||||||
use postgres_protocol::message::backend::ReplicationMessage;
|
use postgres_protocol::message::backend::ReplicationMessage;
|
||||||
use postgres_types::PgLsn;
|
use postgres_types::PgLsn;
|
||||||
use tokio::{pin, select, sync::watch, time};
|
use tokio::{pin, select, sync::watch, time};
|
||||||
use tokio_postgres::{replication::ReplicationStream, Client};
|
use tokio_postgres::{replication::ReplicationStream, Client};
|
||||||
use tokio_stream::StreamExt;
|
use tokio_stream::StreamExt;
|
||||||
use tracing::{debug, error, info, info_span, trace, warn, Instrument};
|
use tracing::{debug, error, info, info_span, trace, warn, Instrument};
|
||||||
use utils::{
|
|
||||||
lsn::Lsn,
|
|
||||||
pq_proto::ZenithFeedback,
|
|
||||||
zid::{NodeId, ZTenantTimelineId},
|
|
||||||
};
|
|
||||||
|
|
||||||
|
use super::TaskEvent;
|
||||||
use crate::{
|
use crate::{
|
||||||
http::models::WalReceiverEntry,
|
http::models::WalReceiverEntry,
|
||||||
repository::{Repository, Timeline},
|
repository::{Repository, Timeline},
|
||||||
tenant_mgr,
|
tenant_mgr,
|
||||||
walingest::WalIngest,
|
walingest::WalIngest,
|
||||||
};
|
};
|
||||||
|
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||||
|
use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId};
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
/// Opens a conneciton to the given wal producer and streams the WAL, sending progress messages during streaming.
|
||||||
pub enum WalConnectionEvent {
|
pub async fn handle_walreceiver_connection(
|
||||||
Started,
|
|
||||||
NewWal(ZenithFeedback),
|
|
||||||
End(Result<(), String>),
|
|
||||||
}
|
|
||||||
|
|
||||||
-/// A wrapper around standalone Tokio task, to poll its updates or cancel the task.
-#[derive(Debug)]
-pub struct WalReceiverConnection {
-    handle: tokio::task::JoinHandle<()>,
-    cancellation: watch::Sender<()>,
-    events_receiver: watch::Receiver<WalConnectionEvent>,
-}
-
-impl WalReceiverConnection {
-    /// Initializes the connection task, returning a set of handles on top of it.
-    /// The task is started immediately after the creation, fails if no connection is established during the timeout given.
-    pub fn open(
-        id: ZTenantTimelineId,
-        safekeeper_id: NodeId,
-        wal_producer_connstr: String,
-        connect_timeout: Duration,
-    ) -> Self {
-        let (cancellation, mut cancellation_receiver) = watch::channel(());
-        let (events_sender, events_receiver) = watch::channel(WalConnectionEvent::Started);
-
-        let handle = tokio::spawn(
-            async move {
-                let connection_result = handle_walreceiver_connection(
-                    id,
-                    &wal_producer_connstr,
-                    &events_sender,
-                    &mut cancellation_receiver,
-                    connect_timeout,
-                )
-                .await
-                .map_err(|e| {
-                    format!("Walreceiver connection for id {id} failed with error: {e:#}")
-                });
-
-                match &connection_result {
-                    Ok(()) => {
-                        debug!("Walreceiver connection for id {id} ended successfully")
-                    }
-                    Err(e) => warn!("{e}"),
-                }
-                events_sender
-                    .send(WalConnectionEvent::End(connection_result))
-                    .ok();
-            }
-            .instrument(info_span!("safekeeper_handle", sk = %safekeeper_id)),
-        );
-
-        Self {
-            handle,
-            cancellation,
-            events_receiver,
-        }
-    }
-
-    /// Polls for the next WAL receiver event, if there's any available since the last check.
-    /// Blocks if there's no new event available, returns `None` if no new events will ever occur.
-    /// Only the last event is returned, all events received between observatins are lost.
-    pub async fn next_event(&mut self) -> Option<WalConnectionEvent> {
-        match self.events_receiver.changed().await {
-            Ok(()) => Some(self.events_receiver.borrow().clone()),
-            Err(_cancellation_error) => None,
-        }
-    }
-
-    /// Gracefully aborts current WAL streaming task, waiting for the current WAL streamed.
-    pub async fn shutdown(&mut self) -> anyhow::Result<()> {
-        self.cancellation.send(()).ok();
-        let handle = &mut self.handle;
-        handle
-            .await
-            .context("Failed to join on a walreceiver connection task")?;
-        Ok(())
-    }
-}
 
-async fn handle_walreceiver_connection(
     id: ZTenantTimelineId,
     wal_producer_connstr: &str,
-    events_sender: &watch::Sender<WalConnectionEvent>,
-    cancellation: &mut watch::Receiver<()>,
+    events_sender: &watch::Sender<TaskEvent<ReplicationFeedback>>,
+    mut cancellation: watch::Receiver<()>,
     connect_timeout: Duration,
 ) -> anyhow::Result<()> {
     // Connect to the database in replication mode.
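The removed `WalReceiverConnection` above is an instance of a reusable shape: spawn a Tokio task and keep a `JoinHandle` plus two `watch` channels, one for cancellation and one publishing the latest event. The stand-alone sketch below reproduces that shape outside the pageserver; the `Event` type, names, and timings are illustrative only, and only the tokio crate is assumed. This is not Neon's actual API.

// Generic sketch of the spawn-plus-watch-channels wrapper (illustrative only).
use tokio::sync::watch;
use tokio::task::JoinHandle;

#[derive(Clone, Debug)]
enum Event {
    Started,
    Progress(u64),
    End(Result<(), String>),
}

struct TaskHandleSketch {
    handle: JoinHandle<()>,
    cancellation: watch::Sender<()>,
    events: watch::Receiver<Event>,
}

impl TaskHandleSketch {
    fn spawn() -> Self {
        let (cancellation, mut cancel_rx) = watch::channel(());
        let (events_tx, events) = watch::channel(Event::Started);
        let handle = tokio::spawn(async move {
            let mut n = 0;
            loop {
                tokio::select! {
                    _ = cancel_rx.changed() => break,
                    _ = tokio::time::sleep(std::time::Duration::from_millis(20)) => {
                        n += 1;
                        // A watch channel retains only the latest value.
                        events_tx.send(Event::Progress(n)).ok();
                    }
                }
            }
            events_tx.send(Event::End(Ok(()))).ok();
        });
        Self { handle, cancellation, events }
    }

    async fn next_event(&mut self) -> Option<Event> {
        // `changed()` errors out once the sender is gone, i.e. no further events.
        self.events.changed().await.ok()?;
        Some(self.events.borrow().clone())
    }

    async fn shutdown(self) {
        self.cancellation.send(()).ok();
        self.handle.await.ok();
    }
}

#[tokio::main]
async fn main() {
    let mut task = TaskHandleSketch::spawn();
    if let Some(event) = task.next_event().await {
        println!("observed: {event:?}");
    }
    task.shutdown().await;
}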
@@ -214,8 +130,6 @@ async fn handle_walreceiver_connection(
 
     while let Some(replication_message) = {
         select! {
-            // check for shutdown first
-            biased;
             _ = cancellation.changed() => {
                 info!("walreceiver interrupted");
                 None
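This hunk drops the `biased;` directive, so the cancellation arm is no longer guaranteed to be polled first; without it, `select!` polls ready arms in random order. A minimal, generic sketch of what `biased;` changes, assuming only the tokio crate (not the pageserver's actual loop):

use tokio::sync::watch;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = watch::channel(());
    tx.send(()).ok(); // cancellation already signalled

    // Both arms below are immediately ready. With `biased;` the arms are polled
    // top-down, so the cancellation arm always wins; without it, tokio picks a
    // ready arm at random, which is what the hunk above switches to.
    tokio::select! {
        biased;
        _ = rx.changed() => println!("cancellation arm polled first"),
        _ = async {} => println!("work arm polled first"),
    }
}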
@@ -328,7 +242,7 @@ async fn handle_walreceiver_connection(
 
                 // Send zenith feedback message.
                 // Regular standby_status_update fields are put into this message.
-                let zenith_status_update = ZenithFeedback {
+                let zenith_status_update = ReplicationFeedback {
                     current_timeline_size: timeline.get_current_logical_size() as u64,
                     ps_writelsn: write_lsn,
                     ps_flushlsn: flush_lsn,
@@ -344,7 +258,7 @@ async fn handle_walreceiver_connection(
                     .as_mut()
                     .zenith_status_update(data.len() as u64, &data)
                     .await?;
-                if let Err(e) = events_sender.send(WalConnectionEvent::NewWal(zenith_status_update)) {
+                if let Err(e) = events_sender.send(TaskEvent::NewEvent(zenith_status_update)) {
                     warn!("Wal connection event listener dropped, aborting the connection: {e}");
                     return Ok(());
                 }
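The send above doubles as a liveness check: `tokio::sync::watch::Sender::send` returns an error as soon as every receiver has been dropped, which the loop treats as "nobody is listening any more" and stops streaming. A tiny sketch of that behaviour, assuming only the tokio crate:

use tokio::sync::watch;

#[tokio::main]
async fn main() {
    let (tx, rx) = watch::channel(0u64);
    assert!(tx.send(1).is_ok()); // a receiver is still alive

    drop(rx);
    // With no receivers left, send() returns an error; a streaming loop can
    // treat this as the signal to shut down cleanly.
    assert!(tx.send(2).is_err());
    println!("all receivers gone, stopping the stream");
}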
@@ -623,6 +623,7 @@ impl PostgresRedoProcess {
             .env_clear()
             .env("LD_LIBRARY_PATH", conf.pg_lib_dir())
             .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir())
+            .close_fds()
             .output()
             .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {}", e)))?;
 

@@ -39,6 +39,8 @@ utils = { path = "../libs/utils" }
 metrics = { path = "../libs/metrics" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
 
+x509-parser = "0.13.2"
+
 [dev-dependencies]
 rcgen = "0.8.14"
 rstest = "0.12"
@@ -19,7 +19,7 @@ pub type Result<T> = std::result::Result<T, ConsoleAuthError>;
 #[derive(Debug, Error)]
 pub enum ConsoleAuthError {
     #[error(transparent)]
-    BadProjectName(#[from] auth::credentials::ProjectNameError),
+    BadProjectName(#[from] auth::credentials::ClientCredsParseError),
 
     // We shouldn't include the actual secret here.
     #[error("Bad authentication secret")]
@@ -49,6 +49,12 @@ impl UserFacingError for ConsoleAuthError {
     }
 }
 
+impl From<&auth::credentials::ClientCredsParseError> for ConsoleAuthError {
+    fn from(e: &auth::credentials::ClientCredsParseError) -> Self {
+        ConsoleAuthError::BadProjectName(e.clone())
+    }
+}
+
 // TODO: convert into an enum with "error"
 #[derive(Serialize, Deserialize, Debug)]
 struct GetRoleSecretResponse {
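The new `From<&ClientCredsParseError>` impl exists so that later code can apply `?` to a borrowed result such as `self.creds.project_name.as_ref()?`: the borrowed error is cloned into an owned `ConsoleAuthError`. A stand-alone sketch of the same pattern with hypothetical stand-in types (`StoredErr`, `TopErr`), using only std:

// Converting a borrowed error via Clone, so `?` works on `Result<&T, &E>`.
#[derive(Debug, Clone)]
struct StoredErr(String);

#[derive(Debug)]
enum TopErr {
    BadProjectName(StoredErr),
}

impl From<&StoredErr> for TopErr {
    fn from(e: &StoredErr) -> Self {
        TopErr::BadProjectName(e.clone())
    }
}

fn use_cached(project: &Result<String, StoredErr>) -> Result<usize, TopErr> {
    // `as_ref()` turns `&Result<String, StoredErr>` into `Result<&String, &StoredErr>`,
    // and `?` then goes through `From<&StoredErr> for TopErr`.
    let name = project.as_ref()?;
    Ok(name.len())
}

fn main() {
    let ok: Result<String, StoredErr> = Ok("my-project-123".into());
    let err: Result<String, StoredErr> = Err(StoredErr("missing SNI".into()));
    println!("{:?} {:?}", use_cached(&ok), use_cached(&err));
}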
@@ -74,18 +80,12 @@ pub enum AuthInfo {
 pub(super) struct Api<'a> {
     endpoint: &'a ApiUrl,
     creds: &'a ClientCredentials,
-    /// Cache project name, since we'll need it several times.
-    project: &'a str,
 }
 
 impl<'a> Api<'a> {
     /// Construct an API object containing the auth parameters.
     pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result<Self> {
-        Ok(Self {
-            endpoint,
-            creds,
-            project: creds.project_name()?,
-        })
+        Ok(Self { endpoint, creds })
     }
 
     /// Authenticate the existing user or throw an error.
@@ -100,7 +100,7 @@ impl<'a> Api<'a> {
         let mut url = self.endpoint.clone();
         url.path_segments_mut().push("proxy_get_role_secret");
         url.query_pairs_mut()
-            .append_pair("project", self.project)
+            .append_pair("project", self.creds.project_name.as_ref()?)
             .append_pair("role", &self.creds.user);
 
         // TODO: use a proper logger
@@ -123,7 +123,8 @@ impl<'a> Api<'a> {
     async fn wake_compute(&self) -> Result<DatabaseInfo> {
         let mut url = self.endpoint.clone();
         url.path_segments_mut().push("proxy_wake_compute");
-        url.query_pairs_mut().append_pair("project", self.project);
+        let project_name = self.creds.project_name.as_ref()?;
+        url.query_pairs_mut().append_pair("project", project_name);
 
         // TODO: use a proper logger
         println!("cplane request: {url}");
@@ -8,10 +8,32 @@ use std::collections::HashMap;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
 
-#[derive(Debug, Error)]
+#[derive(Debug, Error, PartialEq, Eq, Clone)]
 pub enum ClientCredsParseError {
-    #[error("Parameter `{0}` is missing in startup packet")]
+    #[error("Parameter `{0}` is missing in startup packet.")]
     MissingKey(&'static str),
 
+    #[error(
+        "Project name is not specified. \
+        EITHER please upgrade the postgres client library (libpq) for SNI support \
+        OR pass the project name as a parameter: '&options=project%3D<project-name>'."
+    )]
+    MissingSNIAndProjectName,
+
+    #[error("Inconsistent project name inferred from SNI ('{0}') and project option ('{1}').")]
+    InconsistentProjectNameAndSNI(String, String),
+
+    #[error("Common name is not set.")]
+    CommonNameNotSet,
+
+    #[error(
+        "SNI ('{1}') inconsistently formatted with respect to common name ('{0}'). \
+        SNI should be formatted as '<project-name>.<common-name>'."
+    )]
+    InconsistentCommonNameAndSNI(String, String),
+
+    #[error("Project name ('{0}') must contain only alphanumeric characters and hyphens ('-').")]
+    ProjectNameContainsIllegalChars(String),
 }
 
 impl UserFacingError for ClientCredsParseError {}
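The added variants rely on `thiserror`'s positional placeholders, so the user-facing message carries the offending values. A minimal sketch of that mechanism, mirroring one of the variants above with a hypothetical enum name and assuming the thiserror crate:

use thiserror::Error;

#[derive(Debug, Error, PartialEq, Eq, Clone)]
enum ParseErrSketch {
    #[error("Inconsistent project name inferred from SNI ('{0}') and project option ('{1}').")]
    Inconsistent(String, String),
}

fn main() {
    let e = ParseErrSketch::Inconsistent("from-sni".into(), "from-option".into());
    // `Display` is generated from the #[error(...)] attribute.
    println!("{e}");
}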
@@ -22,15 +44,7 @@ impl UserFacingError for ClientCredsParseError {}
 pub struct ClientCredentials {
     pub user: String,
     pub dbname: String,
-    // New console API requires SNI info to determine the cluster name.
-    // Other Auth backends don't need it.
-    pub sni_data: Option<String>,
-
-    // project_name is passed as argument from options from url.
-    // In case sni_data is missing: project_name is used to determine cluster name.
-    // In case sni_data is available: project_name and sni_data should match (otherwise throws an error).
-    pub project_name: Option<String>,
+    pub project_name: Result<String, ClientCredsParseError>,
 }
 
 impl ClientCredentials {
@@ -38,60 +52,14 @@ impl ClientCredentials {
         // This logic will likely change in the future.
         self.user.ends_with("@zenith")
     }
-}
 
-#[derive(Debug, Error)]
-pub enum ProjectNameError {
-    #[error("SNI is missing. EITHER please upgrade the postgres client library OR pass the project name as a parameter: '...&options=project%3D<project-name>...'.")]
-    Missing,
-
-    #[error("SNI is malformed.")]
-    Bad,
-
-    #[error("Inconsistent project name inferred from SNI and project option. String from SNI: '{0}', String from project option: '{1}'")]
-    Inconsistent(String, String),
-}
-
-impl UserFacingError for ProjectNameError {}
-
-impl ClientCredentials {
-    /// Determine project name from SNI or from project_name parameter from options argument.
-    pub fn project_name(&self) -> Result<&str, ProjectNameError> {
-        // Checking that if both sni_data and project_name are set, then they should match
-        // otherwise, throws a ProjectNameError::Inconsistent error.
-        if let Some(sni_data) = &self.sni_data {
-            let project_name_from_sni_data =
-                sni_data.split_once('.').ok_or(ProjectNameError::Bad)?.0;
-            if let Some(project_name_from_options) = &self.project_name {
-                if !project_name_from_options.eq(project_name_from_sni_data) {
-                    return Err(ProjectNameError::Inconsistent(
-                        project_name_from_sni_data.to_string(),
-                        project_name_from_options.to_string(),
-                    ));
-                }
-            }
-        }
-        // determine the project name from self.sni_data if it exists, otherwise from self.project_name.
-        let ret = match &self.sni_data {
-            // if sni_data exists, use it to determine project name
-            Some(sni_data) => sni_data.split_once('.').ok_or(ProjectNameError::Bad)?.0,
-            // otherwise use project_option if it was manually set thought options parameter.
-            None => self
-                .project_name
-                .as_ref()
-                .ok_or(ProjectNameError::Missing)?
-                .as_str(),
-        };
-        Ok(ret)
-    }
-}
-
-impl TryFrom<HashMap<String, String>> for ClientCredentials {
-    type Error = ClientCredsParseError;
-
-    fn try_from(mut value: HashMap<String, String>) -> Result<Self, Self::Error> {
+    pub fn parse(
+        mut options: HashMap<String, String>,
+        sni_data: Option<&str>,
+        common_name: Option<&str>,
+    ) -> Result<Self, ClientCredsParseError> {
         let mut get_param = |key| {
-            value
+            options
                 .remove(key)
                 .ok_or(ClientCredsParseError::MissingKey(key))
         };
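`parse` keeps the existing `get_param` idiom: a closure that removes entries from the owned startup-parameter map, so each required key is consumed and reported individually when missing, while optional keys fall back to `None`. A stand-alone sketch of the idiom with a hypothetical `MissingKey` error:

use std::collections::HashMap;

#[derive(Debug)]
struct MissingKey(&'static str);

fn main() -> Result<(), MissingKey> {
    let mut options: HashMap<String, String> = HashMap::from([
        ("user".to_string(), "john_doe".to_string()),
        ("database".to_string(), "earth".to_string()),
    ]);

    // The closure borrows the map mutably and consumes each required parameter.
    let mut get_param = |key: &'static str| options.remove(key).ok_or(MissingKey(key));

    let user = get_param("user")?;
    let dbname = get_param("database")?;
    // Optional parameter: absence is fine, hence `.ok()`.
    let project = get_param("project").ok();

    println!("user={user} dbname={dbname} project={project:?}");
    Ok(())
}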
@@ -99,17 +67,15 @@ impl TryFrom<HashMap<String, String>> for ClientCredentials {
         let user = get_param("user")?;
         let dbname = get_param("database")?;
         let project_name = get_param("project").ok();
+        let project_name = get_project_name(sni_data, common_name, project_name.as_deref());
 
         Ok(Self {
             user,
             dbname,
-            sni_data: None,
             project_name,
         })
     }
-}
 
-impl ClientCredentials {
     /// Use credentials to authenticate the user.
     pub async fn authenticate(
         self,
@@ -120,3 +86,244 @@ impl ClientCredentials {
         super::backend::handle_user(config, client, self).await
     }
 }
 
+/// Inferring project name from sni_data.
+fn project_name_from_sni_data(
+    sni_data: &str,
+    common_name: &str,
+) -> Result<String, ClientCredsParseError> {
+    let common_name_with_dot = format!(".{common_name}");
+    // check that ".{common_name_with_dot}" is the actual suffix in sni_data
+    if !sni_data.ends_with(&common_name_with_dot) {
+        return Err(ClientCredsParseError::InconsistentCommonNameAndSNI(
+            common_name.to_string(),
+            sni_data.to_string(),
+        ));
+    }
+    // return sni_data without the common name suffix.
+    Ok(sni_data
+        .strip_suffix(&common_name_with_dot)
+        .unwrap()
+        .to_string())
+}
+
+#[cfg(test)]
+mod tests_for_project_name_from_sni_data {
+    use super::*;
+
+    #[test]
+    fn passing() {
+        let target_project_name = "my-project-123";
+        let common_name = "localtest.me";
+        let sni_data = format!("{target_project_name}.{common_name}");
+        assert_eq!(
+            project_name_from_sni_data(&sni_data, common_name),
+            Ok(target_project_name.to_string())
+        );
+    }
+
+    #[test]
+    fn throws_inconsistent_common_name_and_sni_data() {
+        let target_project_name = "my-project-123";
+        let common_name = "localtest.me";
+        let wrong_suffix = "wrongtest.me";
+        assert_eq!(common_name.len(), wrong_suffix.len());
+        let wrong_common_name = format!("wrong{wrong_suffix}");
+        let sni_data = format!("{target_project_name}.{wrong_common_name}");
+        assert_eq!(
+            project_name_from_sni_data(&sni_data, common_name),
+            Err(ClientCredsParseError::InconsistentCommonNameAndSNI(
+                common_name.to_string(),
+                sni_data
+            ))
+        );
+    }
+}
+
+/// Determine project name from SNI or from project_name parameter from options argument.
+fn get_project_name(
+    sni_data: Option<&str>,
+    common_name: Option<&str>,
+    project_name: Option<&str>,
+) -> Result<String, ClientCredsParseError> {
+    // determine the project name from sni_data if it exists, otherwise from project_name.
+    let ret = match sni_data {
+        Some(sni_data) => {
+            let common_name = common_name.ok_or(ClientCredsParseError::CommonNameNotSet)?;
+            let project_name_from_sni = project_name_from_sni_data(sni_data, common_name)?;
+            // check invariant: project name from options and from sni should match
+            if let Some(project_name) = &project_name {
+                if !project_name_from_sni.eq(project_name) {
+                    return Err(ClientCredsParseError::InconsistentProjectNameAndSNI(
+                        project_name_from_sni,
+                        project_name.to_string(),
+                    ));
+                }
+            }
+            project_name_from_sni
+        }
+        None => project_name
+            .ok_or(ClientCredsParseError::MissingSNIAndProjectName)?
+            .to_string(),
+    };
+
+    // check formatting invariant: project name must contain only alphanumeric characters and hyphens.
+    if !ret.chars().all(|x: char| x.is_alphanumeric() || x == '-') {
+        return Err(ClientCredsParseError::ProjectNameContainsIllegalChars(ret));
+    }
+
+    Ok(ret)
+}
+
+#[cfg(test)]
+mod tests_for_project_name_only {
+    use super::*;
+
+    #[test]
+    fn passing_from_sni_data_only() {
+        let target_project_name = "my-project-123";
+        let common_name = "localtest.me";
+        let sni_data = format!("{target_project_name}.{common_name}");
+        assert_eq!(
+            get_project_name(Some(&sni_data), Some(common_name), None),
+            Ok(target_project_name.to_string())
+        );
+    }
+
+    #[test]
+    fn throws_project_name_contains_illegal_chars_from_sni_data_only() {
+        let project_name_prefix = "my-project";
+        let project_name_suffix = "123";
+        let common_name = "localtest.me";
+
+        for illegal_char_id in 0..256 {
+            let illegal_char = char::from_u32(illegal_char_id).unwrap();
+            if !(illegal_char.is_alphanumeric() || illegal_char == '-')
+                && illegal_char.to_string().len() == 1
+            {
+                let target_project_name =
+                    format!("{project_name_prefix}{illegal_char}{project_name_suffix}");
+                let sni_data = format!("{target_project_name}.{common_name}");
+                assert_eq!(
+                    get_project_name(Some(&sni_data), Some(common_name), None),
+                    Err(ClientCredsParseError::ProjectNameContainsIllegalChars(
+                        target_project_name
+                    ))
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn passing_from_project_name_only() {
+        let target_project_name = "my-project-123";
+        let common_names = [Some("localtest.me"), None];
+        for common_name in common_names {
+            assert_eq!(
+                get_project_name(None, common_name, Some(target_project_name)),
+                Ok(target_project_name.to_string())
+            );
+        }
+    }
+
+    #[test]
+    fn throws_project_name_contains_illegal_chars_from_project_name_only() {
+        let project_name_prefix = "my-project";
+        let project_name_suffix = "123";
+        let common_names = [Some("localtest.me"), None];
+
+        for common_name in common_names {
+            for illegal_char_id in 0..256 {
+                let illegal_char: char = char::from_u32(illegal_char_id).unwrap();
+                if !(illegal_char.is_alphanumeric() || illegal_char == '-')
+                    && illegal_char.to_string().len() == 1
+                {
+                    let target_project_name =
+                        format!("{project_name_prefix}{illegal_char}{project_name_suffix}");
+                    assert_eq!(
+                        get_project_name(None, common_name, Some(&target_project_name)),
+                        Err(ClientCredsParseError::ProjectNameContainsIllegalChars(
+                            target_project_name
+                        ))
+                    );
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn passing_from_sni_data_and_project_name() {
+        let target_project_name = "my-project-123";
+        let common_name = "localtest.me";
+        let sni_data = format!("{target_project_name}.{common_name}");
+        assert_eq!(
+            get_project_name(
+                Some(&sni_data),
+                Some(common_name),
+                Some(target_project_name)
+            ),
+            Ok(target_project_name.to_string())
+        );
+    }
+
+    #[test]
+    fn throws_inconsistent_project_name_and_sni() {
+        let project_name_param = "my-project-123";
+        let wrong_project_name = "not-my-project-123";
+        let common_name = "localtest.me";
+        let sni_data = format!("{wrong_project_name}.{common_name}");
+        assert_eq!(
+            get_project_name(Some(&sni_data), Some(common_name), Some(project_name_param)),
+            Err(ClientCredsParseError::InconsistentProjectNameAndSNI(
+                wrong_project_name.to_string(),
+                project_name_param.to_string()
+            ))
+        );
+    }
+
+    #[test]
+    fn throws_common_name_not_set() {
+        let target_project_name = "my-project-123";
+        let wrong_project_name = "not-my-project-123";
+        let common_name = "localtest.me";
+        let sni_datas = [
+            Some(format!("{wrong_project_name}.{common_name}")),
+            Some(format!("{target_project_name}.{common_name}")),
+        ];
+        let project_names = [None, Some(target_project_name)];
+        for sni_data in sni_datas {
+            for project_name_param in project_names {
+                assert_eq!(
+                    get_project_name(sni_data.as_deref(), None, project_name_param),
+                    Err(ClientCredsParseError::CommonNameNotSet)
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn throws_inconsistent_common_name_and_sni_data() {
+        let target_project_name = "my-project-123";
+        let wrong_project_name = "not-my-project-123";
+        let common_name = "localtest.me";
+        let wrong_suffix = "wrongtest.me";
+        assert_eq!(common_name.len(), wrong_suffix.len());
+        let wrong_common_name = format!("wrong{wrong_suffix}");
+        let sni_datas = [
+            Some(format!("{wrong_project_name}.{wrong_common_name}")),
+            Some(format!("{target_project_name}.{wrong_common_name}")),
+        ];
+        let project_names = [None, Some(target_project_name)];
+        for project_name_param in project_names {
+            for sni_data in &sni_datas {
+                assert_eq!(
+                    get_project_name(sni_data.as_deref(), Some(common_name), project_name_param),
+                    Err(ClientCredsParseError::InconsistentCommonNameAndSNI(
+                        common_name.to_string(),
+                        sni_data.clone().unwrap().to_string()
+                    ))
+                );
+            }
+        }
+    }
+}
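Summarising the rules added above: strip `.<common-name>` from the SNI to obtain the project name, require it to agree with any `project` option, fall back to the option when SNI is absent, and reject names containing anything other than alphanumerics and `-`. A compressed stand-alone re-derivation of those rules (not the proxy's code; the error type is simplified to `String`):

fn project_from(
    sni: Option<&str>,
    common_name: Option<&str>,
    option: Option<&str>,
) -> Result<String, String> {
    let name = match sni {
        Some(sni) => {
            let cn = common_name.ok_or("common name is not set")?;
            let from_sni = sni
                .strip_suffix(&format!(".{cn}"))
                .ok_or("SNI does not end with .<common-name>")?;
            if let Some(opt) = option {
                if opt != from_sni {
                    return Err(format!("SNI project '{from_sni}' != option '{opt}'"));
                }
            }
            from_sni.to_string()
        }
        None => option.ok_or("neither SNI nor project option given")?.to_string(),
    };
    if !name.chars().all(|c| c.is_alphanumeric() || c == '-') {
        return Err(format!("illegal characters in '{name}'"));
    }
    Ok(name)
}

fn main() {
    // SNI only: "my-project-123.localtest.me" with common name "localtest.me".
    assert_eq!(
        project_from(Some("my-project-123.localtest.me"), Some("localtest.me"), None),
        Ok("my-project-123".to_string())
    );
    // Option only (no SNI, e.g. an old libpq without SNI support).
    assert_eq!(
        project_from(None, None, Some("my-project-123")),
        Ok("my-project-123".to_string())
    );
    // Mismatch between SNI and option is rejected.
    assert!(project_from(
        Some("other.localtest.me"),
        Some("localtest.me"),
        Some("my-project-123")
    )
    .is_err());
    println!("all precedence checks passed");
}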
@@ -36,23 +36,35 @@ pub struct ProxyConfig {
     pub auth_link_uri: ApiUrl,
 }
 
-pub type TlsConfig = Arc<rustls::ServerConfig>;
+pub struct TlsConfig {
+    pub config: Arc<rustls::ServerConfig>,
+    pub common_name: Option<String>,
+}
+
+impl TlsConfig {
+    pub fn to_server_config(&self) -> Arc<rustls::ServerConfig> {
+        self.config.clone()
+    }
+}
 
 /// Configure TLS for the main endpoint.
 pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfig> {
     let key = {
         let key_bytes = std::fs::read(key_path).context("TLS key file")?;
         let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
-            .context("couldn't read TLS keys")?;
+            .context(format!("Failed to read TLS keys at '{key_path}'"))?;
 
         ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
         keys.pop().map(rustls::PrivateKey).unwrap()
     };
 
+    let cert_chain_bytes = std::fs::read(cert_path)
+        .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
     let cert_chain = {
-        let cert_chain_bytes = std::fs::read(cert_path).context("TLS cert file")?;
         rustls_pemfile::certs(&mut &cert_chain_bytes[..])
-            .context("couldn't read TLS certificate chain")?
+            .context(format!(
+                "Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
+            ))?
             .into_iter()
             .map(rustls::Certificate)
            .collect()
@@ -64,7 +76,25 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfi
         // allow TLS 1.2 to be compatible with older client libraries
         .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
         .with_no_client_auth()
-        .with_single_cert(cert_chain, key)?;
+        .with_single_cert(cert_chain, key)?
+        .into();
 
-    Ok(config.into())
+    // determine common name from tls-cert (-c server.crt param).
+    // used in asserting project name formatting invariant.
+    let common_name = {
+        let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes)
+            .context(format!(
+                "Failed to parse PEM object from bytes from file at '{cert_path}'."
+            ))?
+            .1;
+        let almost_common_name = pem.parse_x509()?.tbs_certificate.subject.to_string();
+        let expected_prefix = "CN=*.";
+        let common_name = almost_common_name.strip_prefix(expected_prefix);
+        common_name.map(str::to_string)
+    };
+
+    Ok(TlsConfig {
+        config,
+        common_name,
+    })
 }
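The common-name extraction assumes the proxy serves a wildcard certificate whose parsed subject prints as `CN=*.<common-name>`; everything after that prefix becomes the common name used to validate SNI. A tiny string-level sketch of that final step, leaving the PEM/X.509 parsing itself to x509-parser:

fn main() {
    // What the certificate subject is expected to render as for a wildcard
    // certificate such as "*.localtest.me".
    let subject = "CN=*.localtest.me";

    let common_name = subject.strip_prefix("CN=*.");
    assert_eq!(common_name, Some("localtest.me"));

    // A non-wildcard or differently shaped subject yields None, and the proxy
    // then has no common name to validate SNI against.
    assert_eq!("CN=localhost".strip_prefix("CN=*."), None);
    println!("common name: {common_name:?}");
}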
@@ -81,7 +81,7 @@ async fn handle_client(
         NUM_CONNECTIONS_CLOSED_COUNTER.inc();
     }
 
-    let tls = config.tls_config.clone();
+    let tls = config.tls_config.as_ref();
     let (stream, creds) = match handshake(stream, tls, cancel_map).await? {
         Some(x) => x,
         None => return Ok(()), // it's a cancellation request
@@ -99,12 +99,14 @@ async fn handle_client(
 /// we also take an extra care of propagating only the select handshake errors to client.
 async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
     stream: S,
-    mut tls: Option<TlsConfig>,
+    mut tls: Option<&TlsConfig>,
     cancel_map: &CancelMap,
 ) -> anyhow::Result<Option<(PqStream<Stream<S>>, auth::ClientCredentials)>> {
     // Client may try upgrading to each protocol only once
     let (mut tried_ssl, mut tried_gss) = (false, false);
 
+    let common_name = tls.and_then(|cfg| cfg.common_name.as_deref());
+
     let mut stream = PqStream::new(Stream::from_raw(stream));
     loop {
         let msg = stream.read_startup_packet().await?;
@@ -122,7 +124,9 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                 if let Some(tls) = tls.take() {
                     // Upgrade raw stream into a secure TLS-backed stream.
                     // NOTE: We've consumed `tls`; this fact will be used later.
-                    stream = PqStream::new(stream.into_inner().upgrade(tls).await?);
+                    stream = PqStream::new(
+                        stream.into_inner().upgrade(tls.to_server_config()).await?,
+                    );
                 }
             }
             _ => bail!(ERR_PROTO_VIOLATION),
@@ -143,15 +147,16 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
             stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
         }
 
-        // Here and forth: `or_else` demands that we use a future here
-        let mut creds: auth::ClientCredentials = async { params.try_into() }
-            .or_else(|e| stream.throw_error(e))
-            .await?;
+        // Get SNI info when available
+        let sni_data = match stream.get_ref() {
+            Stream::Tls { tls } => tls.get_ref().1.sni_hostname().map(|s| s.to_owned()),
+            _ => None,
+        };
 
-        // Set SNI info when available
-        if let Stream::Tls { tls } = stream.get_ref() {
-            creds.sni_data = tls.get_ref().1.sni_hostname().map(|s| s.to_owned());
-        }
+        // Construct credentials
+        let creds =
+            auth::ClientCredentials::parse(params, sni_data.as_deref(), common_name);
+        let creds = async { creds }.or_else(|e| stream.throw_error(e)).await?;
 
         break Ok(Some((stream, creds)));
     }
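When a client cannot send SNI (an older libpq build, or a connection where TLS is not negotiated), the project has to arrive through the startup `options` parameter instead, exactly as the new error message requests. A hedged usage sketch with tokio-postgres, mirroring the test added further down; host, port, and project values are placeholders:

// Supplying the project explicitly when SNI is unavailable.
use tokio_postgres::{config::SslMode, Config, NoTls};

#[tokio::main]
async fn main() -> Result<(), tokio_postgres::Error> {
    let (_client, connection) = Config::new()
        .host("proxy.example.invalid") // placeholder proxy address
        .port(4432)
        .user("john_doe")
        .dbname("earth")
        // Equivalent of libpq's `options=project%3Dmy-project-123` in a URL.
        .options("project=my-project-123")
        .ssl_mode(SslMode::Prefer)
        .connect(NoTls)
        .await?;
    // The connection object drives the socket and must be polled.
    tokio::spawn(connection);
    Ok(())
}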
@@ -264,12 +269,13 @@ mod tests {
     }
 
     /// Generate TLS certificates and build rustls configs for client and server.
-    fn generate_tls_config(
-        hostname: &str,
-    ) -> anyhow::Result<(ClientConfig<'_>, Arc<rustls::ServerConfig>)> {
+    fn generate_tls_config<'a>(
+        hostname: &'a str,
+        common_name: &'a str,
+    ) -> anyhow::Result<(ClientConfig<'a>, TlsConfig)> {
         let (ca, cert, key) = generate_certs(hostname)?;
 
-        let server_config = {
+        let tls_config = {
             let config = rustls::ServerConfig::builder()
                 .with_safe_defaults()
                 .with_no_client_auth()
@@ -291,7 +297,12 @@ mod tests {
             ClientConfig { config, hostname }
         };
 
-        Ok((client_config, server_config))
+        let tls_config = TlsConfig {
+            config: tls_config,
+            common_name: Some(common_name.to_string()),
+        };
+
+        Ok((client_config, tls_config))
     }
 
     #[async_trait]
@@ -346,7 +357,7 @@ mod tests {
         auth: impl TestAuth + Send,
     ) -> anyhow::Result<()> {
         let cancel_map = CancelMap::default();
-        let (mut stream, _creds) = handshake(client, tls, &cancel_map)
+        let (mut stream, _creds) = handshake(client, tls.as_ref(), &cancel_map)
             .await?
             .context("handshake failed")?;
 
@@ -365,7 +376,8 @@ mod tests {
     async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> {
         let (client, server) = tokio::io::duplex(1024);
 
-        let (_, server_config) = generate_tls_config("localhost")?;
+        let (_, server_config) =
+            generate_tls_config("generic-project-name.localhost", "localhost")?;
         let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth));
 
         let client_err = tokio_postgres::Config::new()
@@ -393,7 +405,8 @@ mod tests {
     async fn handshake_tls() -> anyhow::Result<()> {
         let (client, server) = tokio::io::duplex(1024);
 
-        let (client_config, server_config) = generate_tls_config("localhost")?;
+        let (client_config, server_config) =
+            generate_tls_config("generic-project-name.localhost", "localhost")?;
         let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth));
 
         let (_client, _conn) = tokio_postgres::Config::new()
@@ -415,6 +428,7 @@ mod tests {
         let (_client, _conn) = tokio_postgres::Config::new()
             .user("john_doe")
             .dbname("earth")
+            .options("project=generic-project-name")
             .ssl_mode(SslMode::Prefer)
             .connect_raw(server, NoTls)
             .await?;
@@ -476,7 +490,8 @@ mod tests {
     async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
         let (client, server) = tokio::io::duplex(1024);
 
-        let (client_config, server_config) = generate_tls_config("localhost")?;
+        let (client_config, server_config) =
+            generate_tls_config("generic-project-name.localhost", "localhost")?;
         let proxy = tokio::spawn(dummy_proxy(
             client,
             Some(server_config),
@@ -498,7 +513,8 @@ mod tests {
     async fn scram_auth_mock() -> anyhow::Result<()> {
         let (client, server) = tokio::io::duplex(1024);
 
-        let (client_config, server_config) = generate_tls_config("localhost")?;
+        let (client_config, server_config) =
+            generate_tls_config("generic-project-name.localhost", "localhost")?;
         let proxy = tokio::spawn(dummy_proxy(
             client,
             Some(server_config),
Some files were not shown because too many files have changed in this diff.