mirror of
https://github.com/neondatabase/neon.git
synced 2026-03-18 07:40:37 +00:00
Compare commits
58 Commits
partitioni
...
projects_m
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
263a3ea5e3 | ||
|
|
bb69e0920c | ||
|
|
05f6a1394d | ||
|
|
844832ffe4 | ||
|
|
d29c545b5d | ||
|
|
6abdb12724 | ||
|
|
7898e72990 | ||
|
|
65704708fa | ||
|
|
6100a02d0f | ||
|
|
97fed38213 | ||
|
|
cadaca010c | ||
|
|
f09c09438a | ||
|
|
00fc696606 | ||
|
|
1d0706cf25 | ||
|
|
5ee19b0758 | ||
|
|
cef90d9220 | ||
|
|
4a05413a4c | ||
|
|
dd61f3558f | ||
|
|
8a714f1ebf | ||
|
|
137291dc24 | ||
|
|
eb8926083e | ||
|
|
26bca6ddba | ||
|
|
55192384c3 | ||
|
|
392cd8b1fc | ||
|
|
3cc531d093 | ||
|
|
84b9fcbbd5 | ||
|
|
93e050afe3 | ||
|
|
6d7dc384a5 | ||
|
|
3c2b03cd87 | ||
|
|
7c49abe7d1 | ||
|
|
d059e588a6 | ||
|
|
6222a0012b | ||
|
|
1ca28e6f3c | ||
|
|
6c4d6a2183 | ||
|
|
37465dafe3 | ||
|
|
ec0064c442 | ||
|
|
83c7e6ce52 | ||
|
|
f862373ac0 | ||
|
|
699f46cd84 | ||
|
|
36ee182d26 | ||
|
|
d11c9f9fcb | ||
|
|
d8a37452c8 | ||
|
|
e1336f451d | ||
|
|
a4d8261390 | ||
|
|
e2a5a31595 | ||
|
|
0ac0fba77a | ||
|
|
a001052cdd | ||
|
|
1f1d852204 | ||
|
|
f7b878611a | ||
|
|
a51b2dac9a | ||
|
|
e22d9cee3a | ||
|
|
a01999bc4a | ||
|
|
32e64afd54 | ||
|
|
8a53472e4f | ||
|
|
6e26588d17 | ||
|
|
0b93253b3c | ||
|
|
7dc6beacbd | ||
|
|
6cfebc096f |
@@ -6,5 +6,7 @@ timeout = 30
|
|||||||
|
|
||||||
[ssh_connection]
|
[ssh_connection]
|
||||||
ssh_args = -F ./ansible.ssh.cfg
|
ssh_args = -F ./ansible.ssh.cfg
|
||||||
scp_if_ssh = True
|
# teleport doesn't support sftp yet https://github.com/gravitational/teleport/issues/7127
|
||||||
|
# and scp neither worked for me
|
||||||
|
transfer_method = piped
|
||||||
pipelining = True
|
pipelining = True
|
||||||
|
|||||||
@@ -1,3 +1,7 @@
|
|||||||
|
# Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed
|
||||||
|
# (use pre 8.5 option name to cope with old ssh in CI)
|
||||||
|
PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com
|
||||||
|
|
||||||
Host tele.zenith.tech
|
Host tele.zenith.tech
|
||||||
User admin
|
User admin
|
||||||
Port 3023
|
Port 3023
|
||||||
|
|||||||
@@ -57,7 +57,7 @@
|
|||||||
args:
|
args:
|
||||||
creates: "/storage/pageserver/data/tenants"
|
creates: "/storage/pageserver/data/tenants"
|
||||||
environment:
|
environment:
|
||||||
ZENITH_REPO_DIR: "/storage/pageserver/data"
|
NEON_REPO_DIR: "/storage/pageserver/data"
|
||||||
LD_LIBRARY_PATH: "/usr/local/lib"
|
LD_LIBRARY_PATH: "/usr/local/lib"
|
||||||
become: true
|
become: true
|
||||||
tags:
|
tags:
|
||||||
@@ -131,7 +131,7 @@
|
|||||||
args:
|
args:
|
||||||
creates: "/storage/safekeeper/data/safekeeper.id"
|
creates: "/storage/safekeeper/data/safekeeper.id"
|
||||||
environment:
|
environment:
|
||||||
ZENITH_REPO_DIR: "/storage/safekeeper/data"
|
NEON_REPO_DIR: "/storage/safekeeper/data"
|
||||||
LD_LIBRARY_PATH: "/usr/local/lib"
|
LD_LIBRARY_PATH: "/usr/local/lib"
|
||||||
become: true
|
become: true
|
||||||
tags:
|
tags:
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ pageservers
|
|||||||
safekeepers
|
safekeepers
|
||||||
|
|
||||||
[storage:vars]
|
[storage:vars]
|
||||||
|
env_name = prod-1
|
||||||
console_mgmt_base_url = http://console-release.local
|
console_mgmt_base_url = http://console-release.local
|
||||||
bucket_name = zenith-storage-oregon
|
bucket_name = zenith-storage-oregon
|
||||||
bucket_region = us-west-2
|
bucket_region = us-west-2
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
[pageservers]
|
[pageservers]
|
||||||
#zenith-us-stage-ps-1 console_region_id=27
|
#zenith-us-stage-ps-1 console_region_id=27
|
||||||
zenith-us-stage-ps-2 console_region_id=27
|
zenith-us-stage-ps-2 console_region_id=27
|
||||||
|
zenith-us-stage-ps-3 console_region_id=27
|
||||||
|
|
||||||
[safekeepers]
|
[safekeepers]
|
||||||
zenith-us-stage-sk-4 console_region_id=27
|
zenith-us-stage-sk-4 console_region_id=27
|
||||||
@@ -12,6 +13,7 @@ pageservers
|
|||||||
safekeepers
|
safekeepers
|
||||||
|
|
||||||
[storage:vars]
|
[storage:vars]
|
||||||
|
env_name = us-stage
|
||||||
console_mgmt_base_url = http://console-staging.local
|
console_mgmt_base_url = http://console-staging.local
|
||||||
bucket_name = zenith-staging-storage-us-east-1
|
bucket_name = zenith-staging-storage-us-east-1
|
||||||
bucket_region = us-east-1
|
bucket_region = us-east-1
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ After=network.target auditd.service
|
|||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
User=pageserver
|
User=pageserver
|
||||||
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
|
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
|
||||||
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
|
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
|
||||||
ExecReload=/bin/kill -HUP $MAINPID
|
ExecReload=/bin/kill -HUP $MAINPID
|
||||||
KillMode=mixed
|
KillMode=mixed
|
||||||
|
|||||||
@@ -5,8 +5,8 @@ After=network.target auditd.service
|
|||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
User=safekeeper
|
User=safekeeper
|
||||||
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
|
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
|
||||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="wal"}'
|
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
|
||||||
ExecReload=/bin/kill -HUP $MAINPID
|
ExecReload=/bin/kill -HUP $MAINPID
|
||||||
KillMode=mixed
|
KillMode=mixed
|
||||||
KillSignal=SIGINT
|
KillSignal=SIGINT
|
||||||
|
|||||||
@@ -100,10 +100,8 @@ jobs:
|
|||||||
name: Rust build << parameters.build_type >>
|
name: Rust build << parameters.build_type >>
|
||||||
command: |
|
command: |
|
||||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
|
||||||
CARGO_FLAGS=
|
CARGO_FLAGS=
|
||||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||||
cov_prefix=()
|
|
||||||
CARGO_FLAGS="--release --features profiling"
|
CARGO_FLAGS="--release --features profiling"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -112,7 +110,7 @@ jobs:
|
|||||||
export RUSTC_WRAPPER=cachepot
|
export RUSTC_WRAPPER=cachepot
|
||||||
export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
|
export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
|
||||||
export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
|
export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
|
||||||
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
|
mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
|
||||||
cachepot -s
|
cachepot -s
|
||||||
|
|
||||||
- save_cache:
|
- save_cache:
|
||||||
@@ -128,32 +126,24 @@ jobs:
|
|||||||
name: cargo test
|
name: cargo test
|
||||||
command: |
|
command: |
|
||||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
|
||||||
CARGO_FLAGS=
|
CARGO_FLAGS=
|
||||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||||
cov_prefix=()
|
|
||||||
CARGO_FLAGS=--release
|
CARGO_FLAGS=--release
|
||||||
fi
|
fi
|
||||||
|
|
||||||
"${cov_prefix[@]}" cargo test $CARGO_FLAGS
|
cargo test $CARGO_FLAGS
|
||||||
|
|
||||||
# Install the rust binaries, for use by test jobs
|
# Install the rust binaries, for use by test jobs
|
||||||
- run:
|
- run:
|
||||||
name: Install rust binaries
|
name: Install rust binaries
|
||||||
command: |
|
command: |
|
||||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
|
||||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
|
||||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
|
||||||
cov_prefix=()
|
|
||||||
fi
|
|
||||||
|
|
||||||
binaries=$(
|
binaries=$(
|
||||||
"${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
|
cargo metadata --format-version=1 --no-deps |
|
||||||
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
|
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
|
||||||
)
|
)
|
||||||
|
|
||||||
test_exe_paths=$(
|
test_exe_paths=$(
|
||||||
"${cov_prefix[@]}" cargo test --message-format=json --no-run |
|
cargo test --message-format=json --no-run |
|
||||||
jq -r '.executable | select(. != null)'
|
jq -r '.executable | select(. != null)'
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -166,34 +156,15 @@ jobs:
|
|||||||
SRC=target/$BUILD_TYPE/$bin
|
SRC=target/$BUILD_TYPE/$bin
|
||||||
DST=/tmp/zenith/bin/$bin
|
DST=/tmp/zenith/bin/$bin
|
||||||
cp $SRC $DST
|
cp $SRC $DST
|
||||||
echo $DST >> /tmp/zenith/etc/binaries.list
|
|
||||||
done
|
done
|
||||||
|
|
||||||
# Install test executables (for code coverage)
|
|
||||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
|
||||||
for bin in $test_exe_paths; do
|
|
||||||
SRC=$bin
|
|
||||||
DST=/tmp/zenith/test_bin/$(basename $bin)
|
|
||||||
cp $SRC $DST
|
|
||||||
echo $DST >> /tmp/zenith/etc/binaries.list
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Install the postgres binaries, for use by test jobs
|
# Install the postgres binaries, for use by test jobs
|
||||||
- run:
|
- run:
|
||||||
name: Install postgres binaries
|
name: Install postgres binaries
|
||||||
command: |
|
command: |
|
||||||
cp -a tmp_install /tmp/zenith/pg_install
|
cp -a tmp_install /tmp/zenith/pg_install
|
||||||
|
|
||||||
- run:
|
# Save rust binaries for other jobs in the workflow
|
||||||
name: Merge coverage data
|
|
||||||
command: |
|
|
||||||
# This will speed up workspace uploads
|
|
||||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
|
||||||
scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Save the rust binaries and coverage data for other jobs in this workflow.
|
|
||||||
- persist_to_workspace:
|
- persist_to_workspace:
|
||||||
root: /tmp/zenith
|
root: /tmp/zenith
|
||||||
paths:
|
paths:
|
||||||
@@ -286,7 +257,7 @@ jobs:
|
|||||||
# no_output_timeout, specified here.
|
# no_output_timeout, specified here.
|
||||||
no_output_timeout: 10m
|
no_output_timeout: 10m
|
||||||
environment:
|
environment:
|
||||||
- ZENITH_BIN: /tmp/zenith/bin
|
- NEON_BIN: /tmp/zenith/bin
|
||||||
- POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
|
- POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
|
||||||
- TEST_OUTPUT: /tmp/test_output
|
- TEST_OUTPUT: /tmp/test_output
|
||||||
# this variable will be embedded in perf test report
|
# this variable will be embedded in perf test report
|
||||||
@@ -314,12 +285,6 @@ jobs:
|
|||||||
|
|
||||||
export GITHUB_SHA=$CIRCLE_SHA1
|
export GITHUB_SHA=$CIRCLE_SHA1
|
||||||
|
|
||||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
|
||||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
|
||||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
|
||||||
cov_prefix=()
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Run the tests.
|
# Run the tests.
|
||||||
#
|
#
|
||||||
# The junit.xml file allows CircleCI to display more fine-grained test information
|
# The junit.xml file allows CircleCI to display more fine-grained test information
|
||||||
@@ -330,7 +295,7 @@ jobs:
|
|||||||
# -n4 uses four processes to run tests via pytest-xdist
|
# -n4 uses four processes to run tests via pytest-xdist
|
||||||
# -s is not used to prevent pytest from capturing output, because tests are running
|
# -s is not used to prevent pytest from capturing output, because tests are running
|
||||||
# in parallel and logs are mixed between different tests
|
# in parallel and logs are mixed between different tests
|
||||||
"${cov_prefix[@]}" ./scripts/pytest \
|
./scripts/pytest \
|
||||||
--junitxml=$TEST_OUTPUT/junit.xml \
|
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||||
--tb=short \
|
--tb=short \
|
||||||
--verbose \
|
--verbose \
|
||||||
@@ -359,67 +324,12 @@ jobs:
|
|||||||
# The store_test_results step tells CircleCI where to find the junit.xml file.
|
# The store_test_results step tells CircleCI where to find the junit.xml file.
|
||||||
- store_test_results:
|
- store_test_results:
|
||||||
path: /tmp/test_output
|
path: /tmp/test_output
|
||||||
- run:
|
# Save data (if any)
|
||||||
name: Merge coverage data
|
|
||||||
command: |
|
|
||||||
# This will speed up workspace uploads
|
|
||||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
|
||||||
scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
|
|
||||||
fi
|
|
||||||
# Save coverage data (if any)
|
|
||||||
- persist_to_workspace:
|
- persist_to_workspace:
|
||||||
root: /tmp/zenith
|
root: /tmp/zenith
|
||||||
paths:
|
paths:
|
||||||
- "*"
|
- "*"
|
||||||
|
|
||||||
coverage-report:
|
|
||||||
executor: neon-xlarge-executor
|
|
||||||
steps:
|
|
||||||
- attach_workspace:
|
|
||||||
at: /tmp/zenith
|
|
||||||
- checkout
|
|
||||||
- restore_cache:
|
|
||||||
name: Restore rust cache
|
|
||||||
keys:
|
|
||||||
# Require an exact match. While an out of date cache might speed up the build,
|
|
||||||
# there's no way to clean out old packages, so the cache grows every time something
|
|
||||||
# changes.
|
|
||||||
- v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
|
|
||||||
- run:
|
|
||||||
name: Build coverage report
|
|
||||||
command: |
|
|
||||||
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
|
|
||||||
|
|
||||||
scripts/coverage \
|
|
||||||
--dir=/tmp/zenith/coverage report \
|
|
||||||
--input-objects=/tmp/zenith/etc/binaries.list \
|
|
||||||
--commit-url=$COMMIT_URL \
|
|
||||||
--format=github
|
|
||||||
- run:
|
|
||||||
name: Upload coverage report
|
|
||||||
command: |
|
|
||||||
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
|
|
||||||
REPORT_URL=https://neondatabase.github.io/zenith-coverage-data/$CIRCLE_SHA1
|
|
||||||
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
|
|
||||||
|
|
||||||
scripts/git-upload \
|
|
||||||
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/neondatabase/zenith-coverage-data.git \
|
|
||||||
--message="Add code coverage for $COMMIT_URL" \
|
|
||||||
copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
|
|
||||||
|
|
||||||
# Add link to the coverage report to the commit
|
|
||||||
curl -f -X POST \
|
|
||||||
https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
|
|
||||||
-H "Accept: application/vnd.github.v3+json" \
|
|
||||||
--user "$CI_ACCESS_TOKEN" \
|
|
||||||
--data \
|
|
||||||
"{
|
|
||||||
\"state\": \"success\",
|
|
||||||
\"context\": \"zenith-coverage\",
|
|
||||||
\"description\": \"Coverage report is ready\",
|
|
||||||
\"target_url\": \"$REPORT_URL\"
|
|
||||||
}"
|
|
||||||
|
|
||||||
# Build neondatabase/neon:latest image and push it to Docker hub
|
# Build neondatabase/neon:latest image and push it to Docker hub
|
||||||
docker-image:
|
docker-image:
|
||||||
docker:
|
docker:
|
||||||
@@ -688,50 +598,6 @@ jobs:
|
|||||||
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
|
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||||
|
|
||||||
# Trigger a new remote CI job
|
|
||||||
remote-ci-trigger:
|
|
||||||
docker:
|
|
||||||
- image: cimg/base:2021.04
|
|
||||||
parameters:
|
|
||||||
remote_repo:
|
|
||||||
type: string
|
|
||||||
environment:
|
|
||||||
REMOTE_REPO: << parameters.remote_repo >>
|
|
||||||
steps:
|
|
||||||
- run:
|
|
||||||
name: Set PR's status to pending
|
|
||||||
command: |
|
|
||||||
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
|
|
||||||
|
|
||||||
curl -f -X POST \
|
|
||||||
https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
|
|
||||||
-H "Accept: application/vnd.github.v3+json" \
|
|
||||||
--user "$CI_ACCESS_TOKEN" \
|
|
||||||
--data \
|
|
||||||
"{
|
|
||||||
\"state\": \"pending\",
|
|
||||||
\"context\": \"neon-cloud-e2e\",
|
|
||||||
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
|
||||||
}"
|
|
||||||
- run:
|
|
||||||
name: Request a remote CI test
|
|
||||||
command: |
|
|
||||||
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
|
|
||||||
|
|
||||||
curl -f -X POST \
|
|
||||||
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
|
|
||||||
-H "Accept: application/vnd.github.v3+json" \
|
|
||||||
--user "$CI_ACCESS_TOKEN" \
|
|
||||||
--data \
|
|
||||||
"{
|
|
||||||
\"ref\": \"main\",
|
|
||||||
\"inputs\": {
|
|
||||||
\"ci_job_name\": \"neon-cloud-e2e\",
|
|
||||||
\"commit_hash\": \"$CIRCLE_SHA1\",
|
|
||||||
\"remote_repo\": \"$LOCAL_REPO\"
|
|
||||||
}
|
|
||||||
}"
|
|
||||||
|
|
||||||
workflows:
|
workflows:
|
||||||
build_and_test:
|
build_and_test:
|
||||||
jobs:
|
jobs:
|
||||||
@@ -774,12 +640,6 @@ workflows:
|
|||||||
save_perf_report: true
|
save_perf_report: true
|
||||||
requires:
|
requires:
|
||||||
- build-neon-release
|
- build-neon-release
|
||||||
- coverage-report:
|
|
||||||
# Context passes credentials for gh api
|
|
||||||
context: CI_ACCESS_TOKEN
|
|
||||||
requires:
|
|
||||||
# TODO: consider adding more
|
|
||||||
- other-tests-debug
|
|
||||||
- docker-image:
|
- docker-image:
|
||||||
# Context gives an ability to login
|
# Context gives an ability to login
|
||||||
context: Docker Hub
|
context: Docker Hub
|
||||||
@@ -880,14 +740,3 @@ workflows:
|
|||||||
- release
|
- release
|
||||||
requires:
|
requires:
|
||||||
- docker-image-release
|
- docker-image-release
|
||||||
- remote-ci-trigger:
|
|
||||||
# Context passes credentials for gh api
|
|
||||||
context: CI_ACCESS_TOKEN
|
|
||||||
remote_repo: "neondatabase/cloud"
|
|
||||||
requires:
|
|
||||||
# XXX: Successful build doesn't mean everything is OK, but
|
|
||||||
# the job to be triggered takes so much time to complete (~22 min)
|
|
||||||
# that it's better not to wait for the commented-out steps
|
|
||||||
- build-neon-release
|
|
||||||
# - pg_regress-tests-release
|
|
||||||
# - other-tests-release
|
|
||||||
|
|||||||
@@ -9,8 +9,8 @@ tmp_install
|
|||||||
tmp_check_cli
|
tmp_check_cli
|
||||||
test_output
|
test_output
|
||||||
.vscode
|
.vscode
|
||||||
.zenith
|
.neon
|
||||||
integration_tests/.zenith
|
integration_tests/.neon
|
||||||
.mypy_cache
|
.mypy_cache
|
||||||
|
|
||||||
Dockerfile
|
Dockerfile
|
||||||
|
|||||||
140
.github/actions/run-python-test-set/action.yml
vendored
Normal file
140
.github/actions/run-python-test-set/action.yml
vendored
Normal file
@@ -0,0 +1,140 @@
|
|||||||
|
name: 'Run python test'
|
||||||
|
description: 'Runs a Neon python test set, performing all the required preparations before'
|
||||||
|
|
||||||
|
inputs:
|
||||||
|
build_type:
|
||||||
|
description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".'
|
||||||
|
required: true
|
||||||
|
rust_toolchain:
|
||||||
|
description: 'Rust toolchain version to fetch the caches'
|
||||||
|
required: true
|
||||||
|
test_selection:
|
||||||
|
description: 'A python test suite to run'
|
||||||
|
required: true
|
||||||
|
extra_params:
|
||||||
|
description: 'Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr'
|
||||||
|
required: false
|
||||||
|
default: ''
|
||||||
|
needs_postgres_source:
|
||||||
|
description: 'Set to true if the test suite requires postgres source checked out'
|
||||||
|
required: false
|
||||||
|
default: 'false'
|
||||||
|
run_in_parallel:
|
||||||
|
description: 'Whether to run tests in parallel'
|
||||||
|
required: false
|
||||||
|
default: 'true'
|
||||||
|
save_perf_report:
|
||||||
|
description: 'Whether to upload the performance report'
|
||||||
|
required: false
|
||||||
|
default: 'false'
|
||||||
|
|
||||||
|
runs:
|
||||||
|
using: "composite"
|
||||||
|
steps:
|
||||||
|
- name: Get Neon artifact for restoration
|
||||||
|
uses: actions/download-artifact@v3
|
||||||
|
with:
|
||||||
|
name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact
|
||||||
|
path: ./neon-artifact/
|
||||||
|
|
||||||
|
- name: Extract Neon artifact
|
||||||
|
shell: bash -ex {0}
|
||||||
|
run: |
|
||||||
|
mkdir -p /tmp/neon/
|
||||||
|
tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/
|
||||||
|
rm -rf ./neon-artifact/
|
||||||
|
|
||||||
|
- name: Checkout
|
||||||
|
if: inputs.needs_postgres_source == 'true'
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
fetch-depth: 1
|
||||||
|
|
||||||
|
- name: Cache poetry deps
|
||||||
|
id: cache_poetry
|
||||||
|
uses: actions/cache@v3
|
||||||
|
with:
|
||||||
|
path: ~/.cache/pypoetry/virtualenvs
|
||||||
|
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||||
|
|
||||||
|
- name: Install Python deps
|
||||||
|
shell: bash -ex {0}
|
||||||
|
run: ./scripts/pysync
|
||||||
|
|
||||||
|
- name: Run pytest
|
||||||
|
env:
|
||||||
|
NEON_BIN: /tmp/neon/bin
|
||||||
|
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||||
|
TEST_OUTPUT: /tmp/test_output
|
||||||
|
# this variable will be embedded in perf test report
|
||||||
|
# and is needed to distinguish different environments
|
||||||
|
PLATFORM: github-actions-selfhosted
|
||||||
|
shell: bash -ex {0}
|
||||||
|
run: |
|
||||||
|
PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
|
||||||
|
rm -rf $PERF_REPORT_DIR
|
||||||
|
|
||||||
|
TEST_SELECTION="test_runner/${{ inputs.test_selection }}"
|
||||||
|
EXTRA_PARAMS="${{ inputs.extra_params }}"
|
||||||
|
if [ -z "$TEST_SELECTION" ]; then
|
||||||
|
echo "test_selection must be set"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
|
||||||
|
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
|
||||||
|
fi
|
||||||
|
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
|
||||||
|
if [[ "$GITHUB_REF" == "main" ]]; then
|
||||||
|
mkdir -p "$PERF_REPORT_DIR"
|
||||||
|
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
|
||||||
|
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||||
|
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
|
||||||
|
cov_prefix=()
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Run the tests.
|
||||||
|
#
|
||||||
|
# The junit.xml file allows CircleCI to display more fine-grained test information
|
||||||
|
# in its "Tests" tab in the results page.
|
||||||
|
# --verbose prints name of each test (helpful when there are
|
||||||
|
# multiple tests in one file)
|
||||||
|
# -rA prints summary in the end
|
||||||
|
# -n4 uses four processes to run tests via pytest-xdist
|
||||||
|
# -s is not used to prevent pytest from capturing output, because tests are running
|
||||||
|
# in parallel and logs are mixed between different tests
|
||||||
|
"${cov_prefix[@]}" ./scripts/pytest \
|
||||||
|
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||||
|
--tb=short \
|
||||||
|
--verbose \
|
||||||
|
-m "not remote_cluster" \
|
||||||
|
-rA $TEST_SELECTION $EXTRA_PARAMS
|
||||||
|
|
||||||
|
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
|
||||||
|
if [[ "$GITHUB_REF" == "main" ]]; then
|
||||||
|
export REPORT_FROM="$PERF_REPORT_DIR"
|
||||||
|
export REPORT_TO=local
|
||||||
|
scripts/generate_and_push_perf_report.sh
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Delete all data but logs
|
||||||
|
shell: bash -ex {0}
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
du -sh /tmp/test_output/*
|
||||||
|
find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
|
||||||
|
du -sh /tmp/test_output/*
|
||||||
|
|
||||||
|
- name: Upload python test logs
|
||||||
|
if: always()
|
||||||
|
uses: actions/upload-artifact@v3
|
||||||
|
with:
|
||||||
|
retention-days: 7
|
||||||
|
if-no-files-found: error
|
||||||
|
name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs
|
||||||
|
path: /tmp/test_output/
|
||||||
17
.github/actions/save-coverage-data/action.yml
vendored
Normal file
17
.github/actions/save-coverage-data/action.yml
vendored
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
name: 'Merge and upload coverage data'
|
||||||
|
description: 'Compresses and uploads the coverage data as an artifact'
|
||||||
|
|
||||||
|
runs:
|
||||||
|
using: "composite"
|
||||||
|
steps:
|
||||||
|
- name: Merge coverage data
|
||||||
|
shell: bash -ex {0}
|
||||||
|
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
|
||||||
|
|
||||||
|
- name: Upload coverage data
|
||||||
|
uses: actions/upload-artifact@v3
|
||||||
|
with:
|
||||||
|
retention-days: 7
|
||||||
|
if-no-files-found: error
|
||||||
|
name: coverage-data-artifact
|
||||||
|
path: /tmp/coverage/
|
||||||
389
.github/workflows/build_and_test.yml
vendored
Normal file
389
.github/workflows/build_and_test.yml
vendored
Normal file
@@ -0,0 +1,389 @@
|
|||||||
|
name: Test
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
pull_request:
|
||||||
|
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
shell: bash -ex {0}
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
env:
|
||||||
|
RUST_BACKTRACE: 1
|
||||||
|
COPT: '-Werror'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build-postgres:
|
||||||
|
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
build_type: [ debug, release ]
|
||||||
|
rust_toolchain: [ 1.58 ]
|
||||||
|
|
||||||
|
env:
|
||||||
|
BUILD_TYPE: ${{ matrix.build_type }}
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
fetch-depth: 1
|
||||||
|
|
||||||
|
- name: Set pg revision for caching
|
||||||
|
id: pg_ver
|
||||||
|
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
|
||||||
|
|
||||||
|
- name: Cache postgres build
|
||||||
|
id: cache_pg
|
||||||
|
uses: actions/cache@v3
|
||||||
|
with:
|
||||||
|
path: tmp_install/
|
||||||
|
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||||
|
|
||||||
|
- name: Build postgres
|
||||||
|
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||||
|
run: mold -run make postgres -j$(nproc)
|
||||||
|
|
||||||
|
# actions/cache@v3 does not allow concurrently using the same cache across job steps, so use a separate cache
|
||||||
|
- name: Prepare postgres artifact
|
||||||
|
run: tar -C tmp_install/ -czf ./pg.tgz .
|
||||||
|
- name: Upload postgres artifact
|
||||||
|
uses: actions/upload-artifact@v3
|
||||||
|
with:
|
||||||
|
retention-days: 7
|
||||||
|
if-no-files-found: error
|
||||||
|
name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact
|
||||||
|
path: ./pg.tgz
|
||||||
|
|
||||||
|
|
||||||
|
build-neon:
|
||||||
|
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||||
|
needs: [ build-postgres ]
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
build_type: [ debug, release ]
|
||||||
|
rust_toolchain: [ 1.58 ]
|
||||||
|
|
||||||
|
env:
|
||||||
|
BUILD_TYPE: ${{ matrix.build_type }}
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
fetch-depth: 1
|
||||||
|
|
||||||
|
- name: Get postgres artifact for restoration
|
||||||
|
uses: actions/download-artifact@v3
|
||||||
|
with:
|
||||||
|
name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact
|
||||||
|
path: ./postgres-artifact/
|
||||||
|
- name: Extract postgres artifact
|
||||||
|
run: |
|
||||||
|
mkdir ./tmp_install/
|
||||||
|
tar -xf ./postgres-artifact/pg.tgz -C ./tmp_install/
|
||||||
|
rm -rf ./postgres-artifact/
|
||||||
|
|
||||||
|
- name: Cache cargo deps
|
||||||
|
id: cache_cargo
|
||||||
|
uses: actions/cache@v3
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.cargo/registry/
|
||||||
|
~/.cargo/git/
|
||||||
|
target/
|
||||||
|
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
|
||||||
|
key: |
|
||||||
|
v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
|
||||||
|
v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
|
||||||
|
|
||||||
|
- name: Run cargo build
|
||||||
|
run: |
|
||||||
|
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||||
|
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||||
|
CARGO_FLAGS=
|
||||||
|
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||||
|
cov_prefix=()
|
||||||
|
CARGO_FLAGS="--release --features profiling"
|
||||||
|
fi
|
||||||
|
|
||||||
|
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
|
||||||
|
|
||||||
|
- name: Run cargo test
|
||||||
|
run: |
|
||||||
|
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||||
|
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||||
|
CARGO_FLAGS=
|
||||||
|
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||||
|
cov_prefix=()
|
||||||
|
CARGO_FLAGS=--release
|
||||||
|
fi
|
||||||
|
|
||||||
|
"${cov_prefix[@]}" cargo test $CARGO_FLAGS
|
||||||
|
|
||||||
|
- name: Install rust binaries
|
||||||
|
run: |
|
||||||
|
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||||
|
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||||
|
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||||
|
cov_prefix=()
|
||||||
|
fi
|
||||||
|
|
||||||
|
binaries=$(
|
||||||
|
"${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
|
||||||
|
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
|
||||||
|
)
|
||||||
|
|
||||||
|
test_exe_paths=$(
|
||||||
|
"${cov_prefix[@]}" cargo test --message-format=json --no-run |
|
||||||
|
jq -r '.executable | select(. != null)'
|
||||||
|
)
|
||||||
|
|
||||||
|
mkdir -p /tmp/neon/bin/
|
||||||
|
mkdir -p /tmp/neon/test_bin/
|
||||||
|
mkdir -p /tmp/neon/etc/
|
||||||
|
|
||||||
|
# Keep bloated coverage data files away from the rest of the artifact
|
||||||
|
mkdir -p /tmp/coverage/
|
||||||
|
|
||||||
|
# Install target binaries
|
||||||
|
for bin in $binaries; do
|
||||||
|
SRC=target/$BUILD_TYPE/$bin
|
||||||
|
DST=/tmp/neon/bin/$bin
|
||||||
|
cp "$SRC" "$DST"
|
||||||
|
done
|
||||||
|
|
||||||
|
# Install test executables and write list of all binaries (for code coverage)
|
||||||
|
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||||
|
for bin in $binaries; do
|
||||||
|
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
|
||||||
|
done
|
||||||
|
for bin in $test_exe_paths; do
|
||||||
|
SRC=$bin
|
||||||
|
DST=/tmp/neon/test_bin/$(basename $bin)
|
||||||
|
cp "$SRC" "$DST"
|
||||||
|
echo "$DST" >> /tmp/coverage/binaries.list
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Install postgres binaries
|
||||||
|
run: cp -a tmp_install /tmp/neon/pg_install
|
||||||
|
|
||||||
|
- name: Prepare neon artifact
|
||||||
|
run: tar -C /tmp/neon/ -czf ./neon.tgz .
|
||||||
|
|
||||||
|
- name: Upload neon binaries
|
||||||
|
uses: actions/upload-artifact@v3
|
||||||
|
with:
|
||||||
|
retention-days: 7
|
||||||
|
if-no-files-found: error
|
||||||
|
name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
|
||||||
|
path: ./neon.tgz
|
||||||
|
|
||||||
|
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
|
||||||
|
- name: Merge and upload coverage data
|
||||||
|
if: matrix.build_type == 'debug'
|
||||||
|
uses: ./.github/actions/save-coverage-data
|
||||||
|
|
||||||
|
|
||||||
|
pg_regress-tests:
|
||||||
|
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||||
|
needs: [ build-neon ]
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
build_type: [ debug, release ]
|
||||||
|
rust_toolchain: [ 1.58 ]
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
fetch-depth: 2
|
||||||
|
|
||||||
|
- name: Pytest regress tests
|
||||||
|
uses: ./.github/actions/run-python-test-set
|
||||||
|
with:
|
||||||
|
build_type: ${{ matrix.build_type }}
|
||||||
|
rust_toolchain: ${{ matrix.rust_toolchain }}
|
||||||
|
test_selection: batch_pg_regress
|
||||||
|
needs_postgres_source: true
|
||||||
|
|
||||||
|
- name: Merge and upload coverage data
|
||||||
|
if: matrix.build_type == 'debug'
|
||||||
|
uses: ./.github/actions/save-coverage-data
|
||||||
|
|
||||||
|
other-tests:
|
||||||
|
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||||
|
needs: [ build-neon ]
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
build_type: [ debug, release ]
|
||||||
|
rust_toolchain: [ 1.58 ]
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
fetch-depth: 2
|
||||||
|
|
||||||
|
- name: Pytest other tests
|
||||||
|
uses: ./.github/actions/run-python-test-set
|
||||||
|
with:
|
||||||
|
build_type: ${{ matrix.build_type }}
|
||||||
|
rust_toolchain: ${{ matrix.rust_toolchain }}
|
||||||
|
test_selection: batch_others
|
||||||
|
|
||||||
|
- name: Merge and upload coverage data
|
||||||
|
if: matrix.build_type == 'debug'
|
||||||
|
uses: ./.github/actions/save-coverage-data
|
||||||
|
|
||||||
|
benchmarks:
|
||||||
|
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||||
|
needs: [ build-neon ]
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
build_type: [ release ]
|
||||||
|
rust_toolchain: [ 1.58 ]
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
fetch-depth: 2
|
||||||
|
|
||||||
|
- name: Pytest benchmarks
|
||||||
|
uses: ./.github/actions/run-python-test-set
|
||||||
|
with:
|
||||||
|
build_type: ${{ matrix.build_type }}
|
||||||
|
rust_toolchain: ${{ matrix.rust_toolchain }}
|
||||||
|
test_selection: performance
|
||||||
|
run_in_parallel: false
|
||||||
|
save_perf_report: true
|
||||||
|
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||||
|
# while coverage is currently collected for the debug ones
|
||||||
|
|
||||||
|
coverage-report:
|
||||||
|
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||||
|
needs: [ other-tests, pg_regress-tests ]
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
build_type: [ debug ]
|
||||||
|
rust_toolchain: [ 1.58 ]
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
fetch-depth: 1
|
||||||
|
|
||||||
|
- name: Restore cargo deps cache
|
||||||
|
id: cache_cargo
|
||||||
|
uses: actions/cache@v3
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.cargo/registry/
|
||||||
|
~/.cargo/git/
|
||||||
|
target/
|
||||||
|
key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
|
||||||
|
|
||||||
|
- name: Get Neon artifact for restoration
|
||||||
|
uses: actions/download-artifact@v3
|
||||||
|
with:
|
||||||
|
name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
|
||||||
|
path: ./neon-artifact/
|
||||||
|
|
||||||
|
- name: Extract Neon artifact
|
||||||
|
run: |
|
||||||
|
mkdir -p /tmp/neon/
|
||||||
|
tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/
|
||||||
|
rm -rf ./neon-artifact/
|
||||||
|
|
||||||
|
- name: Restore coverage data
|
||||||
|
uses: actions/download-artifact@v3
|
||||||
|
with:
|
||||||
|
name: coverage-data-artifact
|
||||||
|
path: /tmp/coverage/
|
||||||
|
|
||||||
|
- name: Merge coverage data
|
||||||
|
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
|
||||||
|
|
||||||
|
- name: Build and upload coverage report
|
||||||
|
run: |
|
||||||
|
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
|
||||||
|
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
|
||||||
|
COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA
|
||||||
|
|
||||||
|
scripts/coverage \
|
||||||
|
--dir=/tmp/coverage report \
|
||||||
|
--input-objects=/tmp/coverage/binaries.list \
|
||||||
|
--commit-url=$COMMIT_URL \
|
||||||
|
--format=github
|
||||||
|
|
||||||
|
REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA
|
||||||
|
|
||||||
|
scripts/git-upload \
|
||||||
|
--repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \
|
||||||
|
--message="Add code coverage for $COMMIT_URL" \
|
||||||
|
copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE
|
||||||
|
|
||||||
|
# Add link to the coverage report to the commit
|
||||||
|
curl -f -X POST \
|
||||||
|
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
|
||||||
|
-H "Accept: application/vnd.github.v3+json" \
|
||||||
|
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||||
|
--data \
|
||||||
|
"{
|
||||||
|
\"state\": \"success\",
|
||||||
|
\"context\": \"neon-coverage\",
|
||||||
|
\"description\": \"Coverage report is ready\",
|
||||||
|
\"target_url\": \"$REPORT_URL\"
|
||||||
|
}"
|
||||||
|
|
||||||
|
trigger-e2e-tests:
|
||||||
|
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||||
|
needs: [ build-neon ]
|
||||||
|
steps:
|
||||||
|
- name: Set PR's status to pending and request a remote CI test
|
||||||
|
run: |
|
||||||
|
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
|
||||||
|
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
|
||||||
|
|
||||||
|
REMOTE_REPO="${{ github.repository_owner }}/cloud"
|
||||||
|
|
||||||
|
curl -f -X POST \
|
||||||
|
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
|
||||||
|
-H "Accept: application/vnd.github.v3+json" \
|
||||||
|
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||||
|
--data \
|
||||||
|
"{
|
||||||
|
\"state\": \"pending\",
|
||||||
|
\"context\": \"neon-cloud-e2e\",
|
||||||
|
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
||||||
|
}"
|
||||||
|
|
||||||
|
curl -f -X POST \
|
||||||
|
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
|
||||||
|
-H "Accept: application/vnd.github.v3+json" \
|
||||||
|
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||||
|
--data \
|
||||||
|
"{
|
||||||
|
\"ref\": \"main\",
|
||||||
|
\"inputs\": {
|
||||||
|
\"ci_job_name\": \"neon-cloud-e2e\",
|
||||||
|
\"commit_hash\": \"$COMMIT_SHA\",
|
||||||
|
\"remote_repo\": \"${{ github.repository }}\"
|
||||||
|
}
|
||||||
|
}"
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
name: Build and Test
|
name: Check code style and build
|
||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
@@ -6,15 +6,27 @@ on:
|
|||||||
- main
|
- main
|
||||||
pull_request:
|
pull_request:
|
||||||
|
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
shell: bash -ex {0}
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
env:
|
||||||
|
RUST_BACKTRACE: 1
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
regression-check:
|
check-codestyle-rust:
|
||||||
strategy:
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
# If we want to duplicate this job for different
|
# If we want to duplicate this job for different
|
||||||
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
|
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
|
||||||
rust_toolchain: [1.58]
|
rust_toolchain: [1.58]
|
||||||
os: [ubuntu-latest, macos-latest]
|
os: [ubuntu-latest, macos-latest]
|
||||||
timeout-minutes: 30
|
timeout-minutes: 50
|
||||||
name: run regression test suite
|
name: run regression test suite
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
|
|
||||||
@@ -92,5 +104,30 @@ jobs:
|
|||||||
- name: Run cargo clippy
|
- name: Run cargo clippy
|
||||||
run: ./run_clippy.sh
|
run: ./run_clippy.sh
|
||||||
|
|
||||||
- name: Run cargo test
|
- name: Ensure all project builds
|
||||||
run: cargo test --all --all-targets
|
run: cargo build --all --all-targets
|
||||||
|
|
||||||
|
check-codestyle-python:
|
||||||
|
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: false
|
||||||
|
fetch-depth: 1
|
||||||
|
|
||||||
|
- name: Cache poetry deps
|
||||||
|
id: cache_poetry
|
||||||
|
uses: actions/cache@v3
|
||||||
|
with:
|
||||||
|
path: ~/.cache/pypoetry/virtualenvs
|
||||||
|
key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}
|
||||||
|
|
||||||
|
- name: Install Python deps
|
||||||
|
run: ./scripts/pysync
|
||||||
|
|
||||||
|
- name: Run yapf to ensure code format
|
||||||
|
run: poetry run yapf --recursive --diff .
|
||||||
|
|
||||||
|
- name: Run mypy to check types
|
||||||
|
run: poetry run mypy .
|
||||||
74
.github/workflows/pg_clients.yml
vendored
Normal file
74
.github/workflows/pg_clients.yml
vendored
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
name: Test Postgres client libraries
|
||||||
|
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
# * is a special character in YAML so you have to quote this string
|
||||||
|
# ┌───────────── minute (0 - 59)
|
||||||
|
# │ ┌───────────── hour (0 - 23)
|
||||||
|
# │ │ ┌───────────── day of the month (1 - 31)
|
||||||
|
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||||
|
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||||
|
- cron: '23 02 * * *' # run once a day, timezone is utc
|
||||||
|
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test-postgres-client-libs:
|
||||||
|
runs-on: [ ubuntu-latest ]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: 3.9
|
||||||
|
|
||||||
|
- name: Install Poetry
|
||||||
|
uses: snok/install-poetry@v1
|
||||||
|
|
||||||
|
- name: Cache poetry deps
|
||||||
|
id: cache_poetry
|
||||||
|
uses: actions/cache@v3
|
||||||
|
with:
|
||||||
|
path: ~/.cache/pypoetry/virtualenvs
|
||||||
|
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||||
|
|
||||||
|
- name: Install Python deps
|
||||||
|
shell: bash -ex {0}
|
||||||
|
run: ./scripts/pysync
|
||||||
|
|
||||||
|
- name: Run pytest
|
||||||
|
env:
|
||||||
|
REMOTE_ENV: 1
|
||||||
|
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
|
||||||
|
TEST_OUTPUT: /tmp/test_output
|
||||||
|
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||||
|
# this variable will be embedded in perf test report
|
||||||
|
# and is needed to distinguish different environments
|
||||||
|
PLATFORM: github-actions-selfhosted
|
||||||
|
shell: bash -ex {0}
|
||||||
|
run: |
|
||||||
|
# Test framework expects we have psql binary;
|
||||||
|
# but since we don't really need it in this test, let's mock it
|
||||||
|
mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql";
|
||||||
|
./scripts/pytest \
|
||||||
|
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||||
|
--tb=short \
|
||||||
|
--verbose \
|
||||||
|
-m "remote_cluster" \
|
||||||
|
-rA "test_runner/pg_clients"
|
||||||
|
|
||||||
|
- name: Post to a Slack channel
|
||||||
|
if: failure()
|
||||||
|
id: slack
|
||||||
|
uses: slackapi/slack-github-action@v1
|
||||||
|
with:
|
||||||
|
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||||
|
slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||||
|
env:
|
||||||
|
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -6,8 +6,8 @@ __pycache__/
|
|||||||
test_output/
|
test_output/
|
||||||
.vscode
|
.vscode
|
||||||
.idea
|
.idea
|
||||||
/.zenith
|
/.neon
|
||||||
/integration_tests/.zenith
|
/integration_tests/.neon
|
||||||
|
|
||||||
# Coverage
|
# Coverage
|
||||||
*.profraw
|
*.profraw
|
||||||
|
|||||||
@@ -6,5 +6,5 @@ target/
|
|||||||
tmp_install/
|
tmp_install/
|
||||||
__pycache__/
|
__pycache__/
|
||||||
test_output/
|
test_output/
|
||||||
.zenith/
|
.neon/
|
||||||
.git/
|
.git/
|
||||||
|
|||||||
121
Cargo.lock
generated
121
Cargo.lock
generated
@@ -64,6 +64,45 @@ dependencies = [
|
|||||||
"nodrop",
|
"nodrop",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "asn1-rs"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "30ff05a702273012438132f449575dbc804e27b2f3cbe3069aa237d26c98fa33"
|
||||||
|
dependencies = [
|
||||||
|
"asn1-rs-derive",
|
||||||
|
"asn1-rs-impl",
|
||||||
|
"displaydoc",
|
||||||
|
"nom",
|
||||||
|
"num-traits",
|
||||||
|
"rusticata-macros",
|
||||||
|
"thiserror",
|
||||||
|
"time 0.3.9",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "asn1-rs-derive"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "db8b7511298d5b7784b40b092d9e9dcd3a627a5707e4b5e507931ab0d44eeebf"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
"synstructure",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "asn1-rs-impl"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "async-stream"
|
name = "async-stream"
|
||||||
version = "0.3.3"
|
version = "0.3.3"
|
||||||
@@ -422,6 +461,7 @@ dependencies = [
|
|||||||
"tar",
|
"tar",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-postgres",
|
"tokio-postgres",
|
||||||
|
"url",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -712,6 +752,12 @@ dependencies = [
|
|||||||
"syn",
|
"syn",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "data-encoding"
|
||||||
|
version = "2.3.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3ee2393c4a91429dffb4bedf19f4d6abf27d8a732c8ce4980305d782e5426d57"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "debugid"
|
name = "debugid"
|
||||||
version = "0.7.3"
|
version = "0.7.3"
|
||||||
@@ -721,6 +767,20 @@ dependencies = [
|
|||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "der-parser"
|
||||||
|
version = "7.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fe398ac75057914d7d07307bf67dc7f3f574a26783b4fc7805a20ffa9f506e82"
|
||||||
|
dependencies = [
|
||||||
|
"asn1-rs",
|
||||||
|
"displaydoc",
|
||||||
|
"nom",
|
||||||
|
"num-bigint",
|
||||||
|
"num-traits",
|
||||||
|
"rusticata-macros",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "digest"
|
name = "digest"
|
||||||
version = "0.9.0"
|
version = "0.9.0"
|
||||||
@@ -762,6 +822,17 @@ dependencies = [
|
|||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "displaydoc"
|
||||||
|
version = "0.2.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3bf95dc3f046b9da4f2d51833c0d3547d8564ef6910f5c1ed130306a75b92886"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "either"
|
name = "either"
|
||||||
version = "1.6.1"
|
version = "1.6.1"
|
||||||
@@ -1731,6 +1802,15 @@ dependencies = [
|
|||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "oid-registry"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "38e20717fa0541f39bd146692035c37bedfa532b3e5071b35761082407546b2a"
|
||||||
|
dependencies = [
|
||||||
|
"asn1-rs",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "once_cell"
|
name = "once_cell"
|
||||||
version = "1.10.0"
|
version = "1.10.0"
|
||||||
@@ -1842,6 +1922,7 @@ dependencies = [
|
|||||||
"tracing",
|
"tracing",
|
||||||
"url",
|
"url",
|
||||||
"utils",
|
"utils",
|
||||||
|
"walkdir",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -2249,6 +2330,7 @@ dependencies = [
|
|||||||
"url",
|
"url",
|
||||||
"utils",
|
"utils",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
|
"x509-parser",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -2620,6 +2702,15 @@ dependencies = [
|
|||||||
"semver",
|
"semver",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rusticata-macros"
|
||||||
|
version = "4.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632"
|
||||||
|
dependencies = [
|
||||||
|
"nom",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustls"
|
name = "rustls"
|
||||||
version = "0.20.4"
|
version = "0.20.4"
|
||||||
@@ -3059,6 +3150,18 @@ version = "0.1.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8"
|
checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "synstructure"
|
||||||
|
version = "0.12.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
"unicode-xid",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tar"
|
name = "tar"
|
||||||
version = "0.4.38"
|
version = "0.4.38"
|
||||||
@@ -3921,6 +4024,24 @@ dependencies = [
|
|||||||
"tracing-core",
|
"tracing-core",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "x509-parser"
|
||||||
|
version = "0.13.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9fb9bace5b5589ffead1afb76e43e34cff39cd0f3ce7e170ae0c29e53b88eb1c"
|
||||||
|
dependencies = [
|
||||||
|
"asn1-rs",
|
||||||
|
"base64",
|
||||||
|
"data-encoding",
|
||||||
|
"der-parser",
|
||||||
|
"lazy_static",
|
||||||
|
"nom",
|
||||||
|
"oid-registry",
|
||||||
|
"rusticata-macros",
|
||||||
|
"thiserror",
|
||||||
|
"time 0.3.9",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "xattr"
|
name = "xattr"
|
||||||
version = "0.2.2"
|
version = "0.2.2"
|
||||||
|
|||||||
10
Dockerfile
10
Dockerfile
@@ -1,5 +1,5 @@
|
|||||||
# Build Postgres
|
# Build Postgres
|
||||||
FROM zimg/rust:1.58 AS pg-build
|
FROM neondatabase/rust:1.58 AS pg-build
|
||||||
WORKDIR /pg
|
WORKDIR /pg
|
||||||
|
|
||||||
USER root
|
USER root
|
||||||
@@ -14,7 +14,7 @@ RUN set -e \
|
|||||||
&& tar -C tmp_install -czf /postgres_install.tar.gz .
|
&& tar -C tmp_install -czf /postgres_install.tar.gz .
|
||||||
|
|
||||||
# Build zenith binaries
|
# Build zenith binaries
|
||||||
FROM zimg/rust:1.58 AS build
|
FROM neondatabase/rust:1.58 AS build
|
||||||
ARG GIT_VERSION=local
|
ARG GIT_VERSION=local
|
||||||
|
|
||||||
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
|
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
|
||||||
@@ -46,9 +46,9 @@ RUN set -e \
|
|||||||
&& useradd -d /data zenith \
|
&& useradd -d /data zenith \
|
||||||
&& chown -R zenith:zenith /data
|
&& chown -R zenith:zenith /data
|
||||||
|
|
||||||
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin
|
COPY --from=build --chown=zenith:zenith /home/runner/target/release/pageserver /usr/local/bin
|
||||||
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin
|
COPY --from=build --chown=zenith:zenith /home/runner/target/release/safekeeper /usr/local/bin
|
||||||
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy /usr/local/bin
|
COPY --from=build --chown=zenith:zenith /home/runner/target/release/proxy /usr/local/bin
|
||||||
|
|
||||||
COPY --from=pg-build /pg/tmp_install/ /usr/local/
|
COPY --from=pg-build /pg/tmp_install/ /usr/local/
|
||||||
COPY --from=pg-build /postgres_install.tar.gz /data/
|
COPY --from=pg-build /postgres_install.tar.gz /data/
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# First transient image to build compute_tools binaries
|
# First transient image to build compute_tools binaries
|
||||||
# NB: keep in sync with rust image version in .circle/config.yml
|
# NB: keep in sync with rust image version in .circle/config.yml
|
||||||
FROM zimg/rust:1.58 AS rust-build
|
FROM neondatabase/rust:1.58 AS rust-build
|
||||||
|
|
||||||
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
|
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
|
||||||
ARG AWS_ACCESS_KEY_ID
|
ARG AWS_ACCESS_KEY_ID
|
||||||
@@ -15,4 +15,4 @@ RUN set -e \
|
|||||||
# Final image that only has one binary
|
# Final image that only has one binary
|
||||||
FROM debian:buster-slim
|
FROM debian:buster-slim
|
||||||
|
|
||||||
COPY --from=rust-build /home/circleci/project/target/release/compute_ctl /usr/local/bin/compute_ctl
|
COPY --from=rust-build /home/runner/target/release/compute_ctl /usr/local/bin/compute_ctl
|
||||||
|
|||||||
40
README.md
40
README.md
@@ -29,7 +29,7 @@ Pageserver consists of:
|
|||||||
## Running local installation
|
## Running local installation
|
||||||
|
|
||||||
|
|
||||||
#### building on Linux
|
#### Installing dependencies on Linux
|
||||||
1. Install build dependencies and other useful packages
|
1. Install build dependencies and other useful packages
|
||||||
|
|
||||||
* On Ubuntu or Debian this set of packages should be sufficient to build the code:
|
* On Ubuntu or Debian this set of packages should be sufficient to build the code:
|
||||||
@@ -49,18 +49,11 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \
|
|||||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Build neon and patched postgres
|
#### Installing dependencies on OSX (12.3.1)
|
||||||
```sh
|
|
||||||
git clone --recursive https://github.com/neondatabase/neon.git
|
|
||||||
cd neon
|
|
||||||
make -j`nproc`
|
|
||||||
```
|
|
||||||
|
|
||||||
#### building on OSX (12.3.1)
|
|
||||||
1. Install XCode and dependencies
|
1. Install XCode and dependencies
|
||||||
```
|
```
|
||||||
xcode-select --install
|
xcode-select --install
|
||||||
brew install protobuf etcd
|
brew install protobuf etcd openssl
|
||||||
```
|
```
|
||||||
|
|
||||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||||
@@ -76,11 +69,20 @@ brew install libpq
|
|||||||
brew link --force libpq
|
brew link --force libpq
|
||||||
```
|
```
|
||||||
|
|
||||||
4. Build neon and patched postgres
|
#### Building on Linux and OSX
|
||||||
```sh
|
|
||||||
|
1. Build neon and patched postgres
|
||||||
|
```
|
||||||
|
# Note: The path to the neon sources cannot contain a space.
|
||||||
|
|
||||||
git clone --recursive https://github.com/neondatabase/neon.git
|
git clone --recursive https://github.com/neondatabase/neon.git
|
||||||
cd neon
|
cd neon
|
||||||
make -j5
|
|
||||||
|
# The default (and preferred) build type is debug. This will create a
|
||||||
|
# demonstrably slower build than a release build. If you want to use a release
|
||||||
|
# build, run "BUILD_TYPE=release make -j`nproc`"
|
||||||
|
|
||||||
|
make -j`nproc`
|
||||||
```
|
```
|
||||||
|
|
||||||
#### dependency installation notes
|
#### dependency installation notes
|
||||||
@@ -93,7 +95,7 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
|
|||||||
#### running neon database
|
#### running neon database
|
||||||
1. Start pageserver and postgres on top of it (should be called from repo root):
|
1. Start pageserver and postgres on top of it (should be called from repo root):
|
||||||
```sh
|
```sh
|
||||||
# Create repository in .zenith with proper paths to binaries and data
|
# Create repository in .neon with proper paths to binaries and data
|
||||||
# Later that would be responsibility of a package install script
|
# Later that would be responsibility of a package install script
|
||||||
> ./target/debug/neon_local init
|
> ./target/debug/neon_local init
|
||||||
initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c
|
initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c
|
||||||
@@ -103,16 +105,16 @@ pageserver init succeeded
|
|||||||
|
|
||||||
# start pageserver and safekeeper
|
# start pageserver and safekeeper
|
||||||
> ./target/debug/neon_local start
|
> ./target/debug/neon_local start
|
||||||
Starting pageserver at '127.0.0.1:64000' in '.zenith'
|
Starting pageserver at '127.0.0.1:64000' in '.neon'
|
||||||
Pageserver started
|
Pageserver started
|
||||||
initializing for sk 1 for 7676
|
initializing for sk 1 for 7676
|
||||||
Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/sk1'
|
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'
|
||||||
Safekeeper started
|
Safekeeper started
|
||||||
|
|
||||||
# start postgres compute node
|
# start postgres compute node
|
||||||
> ./target/debug/neon_local pg start main
|
> ./target/debug/neon_local pg start main
|
||||||
Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
||||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
|
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
|
||||||
Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
|
Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
|
||||||
|
|
||||||
# check list of running postgres instances
|
# check list of running postgres instances
|
||||||
@@ -149,7 +151,7 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
|
|||||||
# start postgres on that branch
|
# start postgres on that branch
|
||||||
> ./target/debug/neon_local pg start migration_check --branch-name migration_check
|
> ./target/debug/neon_local pg start migration_check --branch-name migration_check
|
||||||
Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
|
Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
|
||||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
|
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
|
||||||
Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
|
Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
|
||||||
|
|
||||||
# check the new list of running postgres instances
|
# check the new list of running postgres instances
|
||||||
@@ -209,7 +211,7 @@ Same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, wh
|
|||||||
To get more familiar with this aspect, refer to:
|
To get more familiar with this aspect, refer to:
|
||||||
|
|
||||||
- [Neon glossary](/docs/glossary.md)
|
- [Neon glossary](/docs/glossary.md)
|
||||||
- [PostgreSQL glossary](https://www.postgresql.org/docs/13/glossary.html)
|
- [PostgreSQL glossary](https://www.postgresql.org/docs/14/glossary.html)
|
||||||
- Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres))
|
- Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres))
|
||||||
|
|
||||||
## Join the development
|
## Join the development
|
||||||
|
|||||||
@@ -18,4 +18,5 @@ serde_json = "1"
|
|||||||
tar = "0.4"
|
tar = "0.4"
|
||||||
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
|
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
|
||||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||||
|
url = "2.2.2"
|
||||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ use std::process::exit;
|
|||||||
use std::sync::{Arc, RwLock};
|
use std::sync::{Arc, RwLock};
|
||||||
use std::{thread, time::Duration};
|
use std::{thread, time::Duration};
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::{Context, Result};
|
||||||
use chrono::Utc;
|
use chrono::Utc;
|
||||||
use clap::Arg;
|
use clap::Arg;
|
||||||
use log::{error, info};
|
use log::{error, info};
|
||||||
@@ -45,6 +45,7 @@ use compute_tools::monitor::launch_monitor;
|
|||||||
use compute_tools::params::*;
|
use compute_tools::params::*;
|
||||||
use compute_tools::pg_helpers::*;
|
use compute_tools::pg_helpers::*;
|
||||||
use compute_tools::spec::*;
|
use compute_tools::spec::*;
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
// TODO: re-use `utils::logging` later
|
// TODO: re-use `utils::logging` later
|
||||||
@@ -131,7 +132,7 @@ fn main() -> Result<()> {
|
|||||||
|
|
||||||
let compute_state = ComputeNode {
|
let compute_state = ComputeNode {
|
||||||
start_time: Utc::now(),
|
start_time: Utc::now(),
|
||||||
connstr: connstr.to_string(),
|
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
|
||||||
pgdata: pgdata.to_string(),
|
pgdata: pgdata.to_string(),
|
||||||
pgbin: pgbin.to_string(),
|
pgbin: pgbin.to_string(),
|
||||||
spec,
|
spec,
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
use anyhow::{anyhow, Result};
|
use anyhow::{anyhow, Result};
|
||||||
use log::error;
|
use log::error;
|
||||||
use postgres::Client;
|
use postgres::Client;
|
||||||
@@ -23,9 +21,8 @@ pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn check_writability(compute: &Arc<ComputeNode>) -> Result<()> {
|
pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
|
||||||
let connstr = &compute.connstr;
|
let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
|
||||||
let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?;
|
|
||||||
if client.is_closed() {
|
if client.is_closed() {
|
||||||
return Err(anyhow!("connection to postgres closed"));
|
return Err(anyhow!("connection to postgres closed"));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -35,7 +35,8 @@ use crate::spec::*;
|
|||||||
/// Compute node info shared across several `compute_ctl` threads.
|
/// Compute node info shared across several `compute_ctl` threads.
|
||||||
pub struct ComputeNode {
|
pub struct ComputeNode {
|
||||||
pub start_time: DateTime<Utc>,
|
pub start_time: DateTime<Utc>,
|
||||||
pub connstr: String,
|
// Url type maintains proper escaping
|
||||||
|
pub connstr: url::Url,
|
||||||
pub pgdata: String,
|
pub pgdata: String,
|
||||||
pub pgbin: String,
|
pub pgbin: String,
|
||||||
pub spec: ComputeSpec,
|
pub spec: ComputeSpec,
|
||||||
@@ -268,27 +269,32 @@ impl ComputeNode {
|
|||||||
// In this case we need to connect with old `zenith_admin`name
|
// In this case we need to connect with old `zenith_admin`name
|
||||||
// and create new user. We cannot simply rename connected user,
|
// and create new user. We cannot simply rename connected user,
|
||||||
// but we can create a new one and grant it all privileges.
|
// but we can create a new one and grant it all privileges.
|
||||||
let mut client = match Client::connect(&self.connstr, NoTls) {
|
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
info!(
|
info!(
|
||||||
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
|
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
|
||||||
e
|
e
|
||||||
);
|
);
|
||||||
let zenith_admin_connstr = self.connstr.replacen("cloud_admin", "zenith_admin", 1);
|
let mut zenith_admin_connstr = self.connstr.clone();
|
||||||
|
|
||||||
let mut client = Client::connect(&zenith_admin_connstr, NoTls)?;
|
zenith_admin_connstr
|
||||||
|
.set_username("zenith_admin")
|
||||||
|
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
|
||||||
|
|
||||||
|
let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
|
||||||
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
||||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||||
drop(client);
|
drop(client);
|
||||||
|
|
||||||
// reconnect with connsting with expected name
|
// reconnect with connsting with expected name
|
||||||
Client::connect(&self.connstr, NoTls)?
|
Client::connect(self.connstr.as_str(), NoTls)?
|
||||||
}
|
}
|
||||||
Ok(client) => client,
|
Ok(client) => client,
|
||||||
};
|
};
|
||||||
|
|
||||||
handle_roles(&self.spec, &mut client)?;
|
handle_roles(&self.spec, &mut client)?;
|
||||||
handle_databases(&self.spec, &mut client)?;
|
handle_databases(&self.spec, &mut client)?;
|
||||||
|
handle_role_deletions(self, &mut client)?;
|
||||||
handle_grants(&self.spec, &mut client)?;
|
handle_grants(&self.spec, &mut client)?;
|
||||||
create_writablity_check_data(&mut client)?;
|
create_writablity_check_data(&mut client)?;
|
||||||
|
|
||||||
|
|||||||
@@ -13,11 +13,11 @@ const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
|
|||||||
// Spin in a loop and figure out the last activity time in the Postgres.
|
// Spin in a loop and figure out the last activity time in the Postgres.
|
||||||
// Then update it in the shared state. This function never errors out.
|
// Then update it in the shared state. This function never errors out.
|
||||||
// XXX: the only expected panic is at `RwLock` unwrap().
|
// XXX: the only expected panic is at `RwLock` unwrap().
|
||||||
fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
fn watch_compute_activity(compute: &ComputeNode) {
|
||||||
// Suppose that `connstr` doesn't change
|
// Suppose that `connstr` doesn't change
|
||||||
let connstr = compute.connstr.clone();
|
let connstr = compute.connstr.as_str();
|
||||||
// Define `client` outside of the loop to reuse existing connection if it's active.
|
// Define `client` outside of the loop to reuse existing connection if it's active.
|
||||||
let mut client = Client::connect(&connstr, NoTls);
|
let mut client = Client::connect(connstr, NoTls);
|
||||||
let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
|
let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
|
||||||
|
|
||||||
info!("watching Postgres activity at {}", connstr);
|
info!("watching Postgres activity at {}", connstr);
|
||||||
@@ -32,7 +32,7 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
|||||||
info!("connection to postgres closed, trying to reconnect");
|
info!("connection to postgres closed, trying to reconnect");
|
||||||
|
|
||||||
// Connection is closed, reconnect and try again.
|
// Connection is closed, reconnect and try again.
|
||||||
client = Client::connect(&connstr, NoTls);
|
client = Client::connect(connstr, NoTls);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -93,7 +93,7 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
|||||||
debug!("cannot connect to postgres: {}, retrying", e);
|
debug!("cannot connect to postgres: {}, retrying", e);
|
||||||
|
|
||||||
// Establish a new connection and try again.
|
// Establish a new connection and try again.
|
||||||
client = Client::connect(&connstr, NoTls);
|
client = Client::connect(connstr, NoTls);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
use std::fmt::Write;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{BufRead, BufReader};
|
use std::io::{BufRead, BufReader};
|
||||||
use std::net::{SocketAddr, TcpStream};
|
use std::net::{SocketAddr, TcpStream};
|
||||||
@@ -138,9 +139,11 @@ impl Role {
|
|||||||
// Now we also support SCRAM-SHA-256 and to preserve compatibility
|
// Now we also support SCRAM-SHA-256 and to preserve compatibility
|
||||||
// we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256.
|
// we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256.
|
||||||
if pass.starts_with("SCRAM-SHA-256") {
|
if pass.starts_with("SCRAM-SHA-256") {
|
||||||
params.push_str(&format!(" PASSWORD '{}'", pass));
|
write!(params, " PASSWORD '{pass}'")
|
||||||
|
.expect("String is documented to not to error during write operations");
|
||||||
} else {
|
} else {
|
||||||
params.push_str(&format!(" PASSWORD 'md5{}'", pass));
|
write!(params, " PASSWORD 'md5{pass}'")
|
||||||
|
.expect("String is documented to not to error during write operations");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
params.push_str(" PASSWORD NULL");
|
params.push_str(" PASSWORD NULL");
|
||||||
@@ -158,7 +161,8 @@ impl Database {
|
|||||||
/// it may require a proper quoting too.
|
/// it may require a proper quoting too.
|
||||||
pub fn to_pg_options(&self) -> String {
|
pub fn to_pg_options(&self) -> String {
|
||||||
let mut params: String = self.options.as_pg_options();
|
let mut params: String = self.options.as_pg_options();
|
||||||
params.push_str(&format!(" OWNER {}", &self.owner.quote()));
|
write!(params, " OWNER {}", &self.owner.quote())
|
||||||
|
.expect("String is documented to not to error during write operations");
|
||||||
|
|
||||||
params
|
params
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,9 +2,10 @@ use std::path::Path;
|
|||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use log::{info, log_enabled, warn, Level};
|
use log::{info, log_enabled, warn, Level};
|
||||||
use postgres::Client;
|
use postgres::{Client, NoTls};
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
|
|
||||||
|
use crate::compute::ComputeNode;
|
||||||
use crate::config;
|
use crate::config;
|
||||||
use crate::params::PG_HBA_ALL_MD5;
|
use crate::params::PG_HBA_ALL_MD5;
|
||||||
use crate::pg_helpers::*;
|
use crate::pg_helpers::*;
|
||||||
@@ -97,18 +98,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
|
|
||||||
// Process delta operations first
|
// Process delta operations first
|
||||||
if let Some(ops) = &spec.delta_operations {
|
if let Some(ops) = &spec.delta_operations {
|
||||||
info!("processing delta operations on roles");
|
info!("processing role renames");
|
||||||
for op in ops {
|
for op in ops {
|
||||||
match op.action.as_ref() {
|
match op.action.as_ref() {
|
||||||
// We do not check either role exists or not,
|
|
||||||
// Postgres will take care of it for us
|
|
||||||
"delete_role" => {
|
"delete_role" => {
|
||||||
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
|
// no-op now, roles will be deleted at the end of configuration
|
||||||
|
|
||||||
warn!("deleting role '{}'", &op.name);
|
|
||||||
xact.execute(query.as_str(), &[])?;
|
|
||||||
}
|
}
|
||||||
// Renaming role drops its password, since tole name is
|
// Renaming role drops its password, since role name is
|
||||||
// used as a salt there. It is important that this role
|
// used as a salt there. It is important that this role
|
||||||
// is recorded with a new `name` in the `roles` list.
|
// is recorded with a new `name` in the `roles` list.
|
||||||
// Follow up roles update will set the new password.
|
// Follow up roles update will set the new password.
|
||||||
@@ -182,7 +178,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
xact.execute(query.as_str(), &[])?;
|
xact.execute(query.as_str(), &[])?;
|
||||||
|
|
||||||
let grant_query = format!(
|
let grant_query = format!(
|
||||||
"grant pg_read_all_data, pg_write_all_data to {}",
|
"GRANT pg_read_all_data, pg_write_all_data TO {}",
|
||||||
name.quote()
|
name.quote()
|
||||||
);
|
);
|
||||||
xact.execute(grant_query.as_str(), &[])?;
|
xact.execute(grant_query.as_str(), &[])?;
|
||||||
@@ -197,6 +193,70 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reassign all dependent objects and delete requested roles.
|
||||||
|
pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||||
|
let spec = &node.spec;
|
||||||
|
|
||||||
|
// First, reassign all dependent objects to db owners.
|
||||||
|
if let Some(ops) = &spec.delta_operations {
|
||||||
|
info!("reassigning dependent objects of to-be-deleted roles");
|
||||||
|
for op in ops {
|
||||||
|
if op.action == "delete_role" {
|
||||||
|
reassign_owned_objects(node, &op.name)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second, proceed with role deletions.
|
||||||
|
let mut xact = client.transaction()?;
|
||||||
|
if let Some(ops) = &spec.delta_operations {
|
||||||
|
info!("processing role deletions");
|
||||||
|
for op in ops {
|
||||||
|
// We do not check either role exists or not,
|
||||||
|
// Postgres will take care of it for us
|
||||||
|
if op.action == "delete_role" {
|
||||||
|
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
|
||||||
|
|
||||||
|
warn!("deleting role '{}'", &op.name);
|
||||||
|
xact.execute(query.as_str(), &[])?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reassign all owned objects in all databases to the owner of the database.
|
||||||
|
fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
|
||||||
|
for db in &node.spec.cluster.databases {
|
||||||
|
if db.owner != *role_name {
|
||||||
|
let mut connstr = node.connstr.clone();
|
||||||
|
// database name is always the last and the only component of the path
|
||||||
|
connstr.set_path(&db.name);
|
||||||
|
|
||||||
|
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||||
|
|
||||||
|
// This will reassign all dependent objects to the db owner
|
||||||
|
let reassign_query = format!(
|
||||||
|
"REASSIGN OWNED BY {} TO {}",
|
||||||
|
role_name.quote(),
|
||||||
|
db.owner.quote()
|
||||||
|
);
|
||||||
|
info!(
|
||||||
|
"reassigning objects owned by '{}' in db '{}' to '{}'",
|
||||||
|
role_name, &db.name, &db.owner
|
||||||
|
);
|
||||||
|
client.simple_query(&reassign_query)?;
|
||||||
|
|
||||||
|
// This now will only drop privileges of the role
|
||||||
|
let drop_query = format!("DROP OWNED BY {}", role_name.quote());
|
||||||
|
client.simple_query(&drop_query)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// It follows mostly the same logic as `handle_roles()` excepting that we
|
/// It follows mostly the same logic as `handle_roles()` excepting that we
|
||||||
/// does not use an explicit transactions block, since major database operations
|
/// does not use an explicit transactions block, since major database operations
|
||||||
/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
|
/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
|
||||||
@@ -294,13 +354,26 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||||
info!("cluster spec grants:");
|
info!("cluster spec grants:");
|
||||||
|
|
||||||
|
// We now have a separate `web_access` role to connect to the database
|
||||||
|
// via the web interface and proxy link auth. And also we grant a
|
||||||
|
// read / write all data privilege to every role. So also grant
|
||||||
|
// create to everyone.
|
||||||
|
// XXX: later we should stop messing with Postgres ACL in such horrible
|
||||||
|
// ways.
|
||||||
|
let roles = spec
|
||||||
|
.cluster
|
||||||
|
.roles
|
||||||
|
.iter()
|
||||||
|
.map(|r| r.name.quote())
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
for db in &spec.cluster.databases {
|
for db in &spec.cluster.databases {
|
||||||
let dbname = &db.name;
|
let dbname = &db.name;
|
||||||
|
|
||||||
let query: String = format!(
|
let query: String = format!(
|
||||||
"GRANT CREATE ON DATABASE {} TO {}",
|
"GRANT CREATE ON DATABASE {} TO {}",
|
||||||
dbname.quote(),
|
dbname.quote(),
|
||||||
db.owner.quote()
|
roles.join(", ")
|
||||||
);
|
);
|
||||||
info!("grant query {}", &query);
|
info!("grant query {}", &query);
|
||||||
|
|
||||||
|
|||||||
@@ -21,9 +21,9 @@ use utils::{
|
|||||||
use crate::safekeeper::SafekeeperNode;
|
use crate::safekeeper::SafekeeperNode;
|
||||||
|
|
||||||
//
|
//
|
||||||
// This data structures represents zenith CLI config
|
// This data structures represents neon_local CLI config
|
||||||
//
|
//
|
||||||
// It is deserialized from the .zenith/config file, or the config file passed
|
// It is deserialized from the .neon/config file, or the config file passed
|
||||||
// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
|
// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
|
||||||
// an example.
|
// an example.
|
||||||
//
|
//
|
||||||
@@ -34,8 +34,8 @@ pub struct LocalEnv {
|
|||||||
// compute nodes).
|
// compute nodes).
|
||||||
//
|
//
|
||||||
// This is not stored in the config file. Rather, this is the path where the
|
// This is not stored in the config file. Rather, this is the path where the
|
||||||
// config file itself is. It is read from the ZENITH_REPO_DIR env variable or
|
// config file itself is. It is read from the NEON_REPO_DIR env variable or
|
||||||
// '.zenith' if not given.
|
// '.neon' if not given.
|
||||||
#[serde(skip)]
|
#[serde(skip)]
|
||||||
pub base_data_dir: PathBuf,
|
pub base_data_dir: PathBuf,
|
||||||
|
|
||||||
@@ -177,6 +177,7 @@ pub struct SafekeeperConf {
|
|||||||
pub sync: bool,
|
pub sync: bool,
|
||||||
pub remote_storage: Option<String>,
|
pub remote_storage: Option<String>,
|
||||||
pub backup_threads: Option<u32>,
|
pub backup_threads: Option<u32>,
|
||||||
|
pub auth_enabled: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for SafekeeperConf {
|
impl Default for SafekeeperConf {
|
||||||
@@ -188,6 +189,7 @@ impl Default for SafekeeperConf {
|
|||||||
sync: true,
|
sync: true,
|
||||||
remote_storage: None,
|
remote_storage: None,
|
||||||
backup_threads: None,
|
backup_threads: None,
|
||||||
|
auth_enabled: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -337,7 +339,7 @@ impl LocalEnv {
|
|||||||
pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
|
pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
|
||||||
// Currently, the user first passes a config file with 'zenith init --config=<path>'
|
// Currently, the user first passes a config file with 'zenith init --config=<path>'
|
||||||
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
|
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
|
||||||
// to .zenith/config. TODO: We lose any formatting and comments along the way, which is
|
// to .neon/config. TODO: We lose any formatting and comments along the way, which is
|
||||||
// a bit sad.
|
// a bit sad.
|
||||||
let mut conf_content = r#"# This file describes a locale deployment of the page server
|
let mut conf_content = r#"# This file describes a locale deployment of the page server
|
||||||
# and safekeeeper node. It is read by the 'zenith' command-line
|
# and safekeeeper node. It is read by the 'zenith' command-line
|
||||||
@@ -401,16 +403,6 @@ impl LocalEnv {
|
|||||||
self.pg_distrib_dir.display()
|
self.pg_distrib_dir.display()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
for binary in ["pageserver", "safekeeper"] {
|
|
||||||
if !self.zenith_distrib_dir.join(binary).exists() {
|
|
||||||
bail!(
|
|
||||||
"Can't find binary '{}' in zenith distrib dir '{}'",
|
|
||||||
binary,
|
|
||||||
self.zenith_distrib_dir.display()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for binary in ["pageserver", "safekeeper"] {
|
for binary in ["pageserver", "safekeeper"] {
|
||||||
if !self.zenith_distrib_dir.join(binary).exists() {
|
if !self.zenith_distrib_dir.join(binary).exists() {
|
||||||
bail!(
|
bail!(
|
||||||
@@ -419,12 +411,6 @@ impl LocalEnv {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !self.pg_distrib_dir.join("bin/postgres").exists() {
|
|
||||||
bail!(
|
|
||||||
"Can't find postgres binary at {}",
|
|
||||||
self.pg_distrib_dir.display()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
fs::create_dir(&base_path)?;
|
fs::create_dir(&base_path)?;
|
||||||
|
|
||||||
@@ -481,9 +467,9 @@ impl LocalEnv {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn base_path() -> PathBuf {
|
fn base_path() -> PathBuf {
|
||||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
match std::env::var_os("NEON_REPO_DIR") {
|
||||||
Some(val) => PathBuf::from(val),
|
Some(val) => PathBuf::from(val),
|
||||||
None => PathBuf::from(".zenith"),
|
None => PathBuf::from(".neon"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -149,6 +149,11 @@ impl SafekeeperNode {
|
|||||||
if let Some(ref remote_storage) = self.conf.remote_storage {
|
if let Some(ref remote_storage) = self.conf.remote_storage {
|
||||||
cmd.args(&["--remote-storage", remote_storage]);
|
cmd.args(&["--remote-storage", remote_storage]);
|
||||||
}
|
}
|
||||||
|
if self.conf.auth_enabled {
|
||||||
|
cmd.arg("--auth-validation-public-key-path");
|
||||||
|
// PathBuf is better be passed as is, not via `String`.
|
||||||
|
cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
|
||||||
|
}
|
||||||
|
|
||||||
fill_aws_secrets_vars(&mut cmd);
|
fill_aws_secrets_vars(&mut cmd);
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::io::Write;
|
use std::fs::File;
|
||||||
|
use std::io::{BufReader, Write};
|
||||||
use std::net::TcpStream;
|
use std::net::TcpStream;
|
||||||
use std::num::NonZeroU64;
|
use std::num::NonZeroU64;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
@@ -527,4 +528,54 @@ impl PageServerNode {
|
|||||||
|
|
||||||
Ok(timeline_info_response)
|
Ok(timeline_info_response)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Import a basebackup prepared using either:
|
||||||
|
/// a) `pg_basebackup -F tar`, or
|
||||||
|
/// b) The `fullbackup` pageserver endpoint
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `tenant_id` - tenant to import into. Created if not exists
|
||||||
|
/// * `timeline_id` - id to assign to imported timeline
|
||||||
|
/// * `base` - (start lsn of basebackup, path to `base.tar` file)
|
||||||
|
/// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`)
|
||||||
|
pub fn timeline_import(
|
||||||
|
&self,
|
||||||
|
tenant_id: ZTenantId,
|
||||||
|
timeline_id: ZTimelineId,
|
||||||
|
base: (Lsn, PathBuf),
|
||||||
|
pg_wal: Option<(Lsn, PathBuf)>,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
|
||||||
|
|
||||||
|
// Init base reader
|
||||||
|
let (start_lsn, base_tarfile_path) = base;
|
||||||
|
let base_tarfile = File::open(base_tarfile_path)?;
|
||||||
|
let mut base_reader = BufReader::new(base_tarfile);
|
||||||
|
|
||||||
|
// Init wal reader if necessary
|
||||||
|
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
|
||||||
|
let wal_tarfile = File::open(wal_tarfile_path)?;
|
||||||
|
let wal_reader = BufReader::new(wal_tarfile);
|
||||||
|
(end_lsn, Some(wal_reader))
|
||||||
|
} else {
|
||||||
|
(start_lsn, None)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Import base
|
||||||
|
let import_cmd =
|
||||||
|
format!("import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
|
||||||
|
let mut writer = client.copy_in(&import_cmd)?;
|
||||||
|
io::copy(&mut base_reader, &mut writer)?;
|
||||||
|
writer.finish()?;
|
||||||
|
|
||||||
|
// Import wal if necessary
|
||||||
|
if let Some(mut wal_reader) = wal_reader {
|
||||||
|
let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
|
||||||
|
let mut writer = client.copy_in(&import_cmd)?;
|
||||||
|
io::copy(&mut wal_reader, &mut writer)?;
|
||||||
|
writer.finish()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,12 +36,12 @@ This is how the `LOGICAL_TIMELINE_SIZE` metric is implemented in the pageserver.
|
|||||||
Alternatively, we could count only relation data. As in pg_database_size().
|
Alternatively, we could count only relation data. As in pg_database_size().
|
||||||
This approach is somewhat more user-friendly because it is the data that is really affected by the user.
|
This approach is somewhat more user-friendly because it is the data that is really affected by the user.
|
||||||
On the other hand, it puts us in a weaker position than other services, i.e., RDS.
|
On the other hand, it puts us in a weaker position than other services, i.e., RDS.
|
||||||
We will need to refactor the timeline_size counter or add another counter to implement it.
|
We will need to refactor the timeline_size counter or add another counter to implement it.
|
||||||
|
|
||||||
Timeline size is updated during wal digestion. It is not versioned and is valid at the last_received_lsn moment.
|
Timeline size is updated during wal digestion. It is not versioned and is valid at the last_received_lsn moment.
|
||||||
Then this size should be reported to compute node.
|
Then this size should be reported to compute node.
|
||||||
|
|
||||||
`current_timeline_size` value is included in the walreceiver's custom feedback message: `ZenithFeedback.`
|
`current_timeline_size` value is included in the walreceiver's custom feedback message: `ReplicationFeedback.`
|
||||||
|
|
||||||
(PR about protocol changes https://github.com/zenithdb/zenith/pull/1037).
|
(PR about protocol changes https://github.com/zenithdb/zenith/pull/1037).
|
||||||
|
|
||||||
@@ -64,11 +64,11 @@ We should warn users if the limit is soon to be reached.
|
|||||||
### **Reliability, failure modes and corner cases**
|
### **Reliability, failure modes and corner cases**
|
||||||
|
|
||||||
1. `current_timeline_size` is valid at the last received and digested by pageserver lsn.
|
1. `current_timeline_size` is valid at the last received and digested by pageserver lsn.
|
||||||
|
|
||||||
If pageserver lags behind compute node, `current_timeline_size` will lag too. This lag can be tuned using backpressure, but it is not expected to be 0 all the time.
|
If pageserver lags behind compute node, `current_timeline_size` will lag too. This lag can be tuned using backpressure, but it is not expected to be 0 all the time.
|
||||||
|
|
||||||
So transactions that happen in this lsn range may cause limit overflow. Especially operations that generate (i.e., CREATE DATABASE) or free (i.e., TRUNCATE) a lot of data pages while generating a small amount of WAL. Are there other operations like this?
|
So transactions that happen in this lsn range may cause limit overflow. Especially operations that generate (i.e., CREATE DATABASE) or free (i.e., TRUNCATE) a lot of data pages while generating a small amount of WAL. Are there other operations like this?
|
||||||
|
|
||||||
Currently, CREATE DATABASE operations are restricted in the console. So this is not an issue.
|
Currently, CREATE DATABASE operations are restricted in the console. So this is not an issue.
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -154,7 +154,7 @@ The default distrib dir is `./tmp_install/`.
|
|||||||
#### workdir (-D)
|
#### workdir (-D)
|
||||||
|
|
||||||
A directory in the file system, where pageserver will store its files.
|
A directory in the file system, where pageserver will store its files.
|
||||||
The default is `./.zenith/`.
|
The default is `./.neon/`.
|
||||||
|
|
||||||
This parameter has a special CLI alias (`-D`) and can not be overridden with regular `-c` way.
|
This parameter has a special CLI alias (`-D`) and can not be overridden with regular `-c` way.
|
||||||
|
|
||||||
|
|||||||
@@ -1,62 +1,81 @@
|
|||||||
//! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent).
|
//! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent).
|
||||||
//! Intended to connect services to each other, not to store their data.
|
//! Intended to connect services to each other, not to store their data.
|
||||||
use std::{
|
|
||||||
collections::{hash_map, HashMap},
|
|
||||||
fmt::Display,
|
|
||||||
str::FromStr,
|
|
||||||
};
|
|
||||||
|
|
||||||
use once_cell::sync::Lazy;
|
/// All broker keys, that are used when dealing with etcd.
|
||||||
use regex::{Captures, Regex};
|
pub mod subscription_key;
|
||||||
use serde::{Deserialize, Serialize};
|
/// All broker values, possible to use when dealing with etcd.
|
||||||
use serde_with::{serde_as, DisplayFromStr};
|
pub mod subscription_value;
|
||||||
|
|
||||||
pub use etcd_client::*;
|
use std::str::FromStr;
|
||||||
|
|
||||||
|
use serde::de::DeserializeOwned;
|
||||||
|
|
||||||
|
use subscription_key::SubscriptionKey;
|
||||||
use tokio::{sync::mpsc, task::JoinHandle};
|
use tokio::{sync::mpsc, task::JoinHandle};
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::{
|
|
||||||
lsn::Lsn,
|
use crate::subscription_key::SubscriptionFullKey;
|
||||||
zid::{NodeId, ZTenantId, ZTenantTimelineId},
|
|
||||||
};
|
pub use etcd_client::*;
|
||||||
|
|
||||||
/// Default value to use for prefixing to all etcd keys with.
|
/// Default value to use for prefixing to all etcd keys with.
|
||||||
/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster.
|
/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster.
|
||||||
pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon";
|
pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon";
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, Serialize)]
|
/// A way to control the data retrieval from a certain subscription.
|
||||||
struct SafekeeperTimeline {
|
pub struct BrokerSubscription<V> {
|
||||||
safekeeper_id: NodeId,
|
/// An unbounded channel to fetch the relevant etcd updates from.
|
||||||
info: SkTimelineInfo,
|
pub value_updates: mpsc::UnboundedReceiver<BrokerUpdate<V>>,
|
||||||
|
key: SubscriptionKey,
|
||||||
|
/// A subscription task handle, to allow waiting on it for the task to complete.
|
||||||
|
/// Both the updates channel and the handle require `&mut`, so it's better to keep
|
||||||
|
/// both `pub` to allow using both in the same structures without borrow checker complaining.
|
||||||
|
pub watcher_handle: JoinHandle<Result<(), BrokerError>>,
|
||||||
|
watcher: Watcher,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Published data about safekeeper's timeline. Fields made optional for easy migrations.
|
impl<V> BrokerSubscription<V> {
|
||||||
#[serde_as]
|
/// Cancels the subscription, stopping the data poller and waiting for it to shut down.
|
||||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
pub async fn cancel(mut self) -> Result<(), BrokerError> {
|
||||||
pub struct SkTimelineInfo {
|
self.watcher.cancel().await.map_err(|e| {
|
||||||
/// Term of the last entry.
|
BrokerError::EtcdClient(
|
||||||
pub last_log_term: Option<u64>,
|
e,
|
||||||
/// LSN of the last record.
|
format!("Failed to cancel broker subscription, kind: {:?}", self.key),
|
||||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
)
|
||||||
#[serde(default)]
|
})?;
|
||||||
pub flush_lsn: Option<Lsn>,
|
match (&mut self.watcher_handle).await {
|
||||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
Ok(res) => res,
|
||||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
Err(e) => {
|
||||||
#[serde(default)]
|
if e.is_cancelled() {
|
||||||
pub commit_lsn: Option<Lsn>,
|
// don't error on the tasks that are cancelled already
|
||||||
/// LSN up to which safekeeper has backed WAL.
|
Ok(())
|
||||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
} else {
|
||||||
#[serde(default)]
|
Err(BrokerError::InternalError(format!(
|
||||||
pub backup_lsn: Option<Lsn>,
|
"Panicked during broker subscription task, kind: {:?}, error: {e}",
|
||||||
/// LSN of last checkpoint uploaded by pageserver.
|
self.key
|
||||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
)))
|
||||||
#[serde(default)]
|
}
|
||||||
pub remote_consistent_lsn: Option<Lsn>,
|
}
|
||||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
}
|
||||||
#[serde(default)]
|
}
|
||||||
pub peer_horizon_lsn: Option<Lsn>,
|
}
|
||||||
#[serde(default)]
|
|
||||||
pub safekeeper_connstr: Option<String>,
|
impl<V> Drop for BrokerSubscription<V> {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
// we poll data from etcd into the channel in the same struct, so if the whole struct gets dropped,
|
||||||
|
// no more data is used by the receiver and it's safe to cancel and drop the whole etcd subscription task.
|
||||||
|
self.watcher_handle.abort();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An update from the etcd broker.
|
||||||
|
pub struct BrokerUpdate<V> {
|
||||||
|
/// Etcd generation version: the bigger it is, the more up to date the data is.
|
||||||
|
pub etcd_version: i64,
|
||||||
|
/// Etcd key for the corresponding value, parsed from the broker KV.
|
||||||
|
pub key: SubscriptionFullKey,
|
||||||
|
/// Current etcd value, parsed from the broker KV.
|
||||||
|
pub value: V,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
#[derive(Debug, thiserror::Error)]
|
||||||
@@ -64,331 +83,127 @@ pub enum BrokerError {
|
|||||||
#[error("Etcd client error: {0}. Context: {1}")]
|
#[error("Etcd client error: {0}. Context: {1}")]
|
||||||
EtcdClient(etcd_client::Error, String),
|
EtcdClient(etcd_client::Error, String),
|
||||||
#[error("Error during parsing etcd key: {0}")]
|
#[error("Error during parsing etcd key: {0}")]
|
||||||
InvalidKey(String),
|
KeyNotParsed(String),
|
||||||
#[error("Error during parsing etcd value: {0}")]
|
|
||||||
ParsingError(String),
|
|
||||||
#[error("Internal error: {0}")]
|
#[error("Internal error: {0}")]
|
||||||
InternalError(String),
|
InternalError(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A way to control the data retrieval from a certain subscription.
|
|
||||||
pub struct SkTimelineSubscription {
|
|
||||||
safekeeper_timeline_updates:
|
|
||||||
mpsc::UnboundedReceiver<HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>>>,
|
|
||||||
kind: SkTimelineSubscriptionKind,
|
|
||||||
watcher_handle: JoinHandle<Result<(), BrokerError>>,
|
|
||||||
watcher: Watcher,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SkTimelineSubscription {
|
|
||||||
/// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet.
|
|
||||||
pub async fn fetch_data(
|
|
||||||
&mut self,
|
|
||||||
) -> Option<HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>>> {
|
|
||||||
self.safekeeper_timeline_updates.recv().await
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Cancels the subscription, stopping the data poller and waiting for it to shut down.
|
|
||||||
pub async fn cancel(mut self) -> Result<(), BrokerError> {
|
|
||||||
self.watcher.cancel().await.map_err(|e| {
|
|
||||||
BrokerError::EtcdClient(
|
|
||||||
e,
|
|
||||||
format!(
|
|
||||||
"Failed to cancel timeline subscription, kind: {:?}",
|
|
||||||
self.kind
|
|
||||||
),
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
self.watcher_handle.await.map_err(|e| {
|
|
||||||
BrokerError::InternalError(format!(
|
|
||||||
"Failed to join the timeline updates task, kind: {:?}, error: {e}",
|
|
||||||
self.kind
|
|
||||||
))
|
|
||||||
})?
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The subscription kind to the timeline updates from safekeeper.
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
|
||||||
pub struct SkTimelineSubscriptionKind {
|
|
||||||
broker_etcd_prefix: String,
|
|
||||||
kind: SubscriptionKind,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SkTimelineSubscriptionKind {
|
|
||||||
pub fn all(broker_etcd_prefix: String) -> Self {
|
|
||||||
Self {
|
|
||||||
broker_etcd_prefix,
|
|
||||||
kind: SubscriptionKind::All,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn tenant(broker_etcd_prefix: String, tenant: ZTenantId) -> Self {
|
|
||||||
Self {
|
|
||||||
broker_etcd_prefix,
|
|
||||||
kind: SubscriptionKind::Tenant(tenant),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn timeline(broker_etcd_prefix: String, timeline: ZTenantTimelineId) -> Self {
|
|
||||||
Self {
|
|
||||||
broker_etcd_prefix,
|
|
||||||
kind: SubscriptionKind::Timeline(timeline),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Etcd key to use for watching a certain timeline updates from safekeepers.
|
|
||||||
pub fn watch_key(&self) -> String {
|
|
||||||
match self.kind {
|
|
||||||
SubscriptionKind::All => self.broker_etcd_prefix.to_string(),
|
|
||||||
SubscriptionKind::Tenant(tenant_id) => {
|
|
||||||
format!("{}/{tenant_id}/safekeeper", self.broker_etcd_prefix)
|
|
||||||
}
|
|
||||||
SubscriptionKind::Timeline(ZTenantTimelineId {
|
|
||||||
tenant_id,
|
|
||||||
timeline_id,
|
|
||||||
}) => format!(
|
|
||||||
"{}/{tenant_id}/{timeline_id}/safekeeper",
|
|
||||||
self.broker_etcd_prefix
|
|
||||||
),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
|
||||||
enum SubscriptionKind {
|
|
||||||
/// Get every timeline update.
|
|
||||||
All,
|
|
||||||
/// Get certain tenant timelines' updates.
|
|
||||||
Tenant(ZTenantId),
|
|
||||||
/// Get certain timeline updates.
|
|
||||||
Timeline(ZTenantTimelineId),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Creates a background task to poll etcd for timeline updates from safekeepers.
|
/// Creates a background task to poll etcd for timeline updates from safekeepers.
|
||||||
/// Stops and returns `Err` on any error during etcd communication.
|
/// Stops and returns `Err` on any error during etcd communication.
|
||||||
/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle,
|
/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle,
|
||||||
/// exiting normally in such cases.
|
/// exiting normally in such cases.
|
||||||
pub async fn subscribe_to_safekeeper_timeline_updates(
|
/// Etcd values are parsed as JSON into the type specified by the generic parameter.
|
||||||
|
pub async fn subscribe_for_json_values<V>(
|
||||||
client: &mut Client,
|
client: &mut Client,
|
||||||
subscription: SkTimelineSubscriptionKind,
|
key: SubscriptionKey,
|
||||||
) -> Result<SkTimelineSubscription, BrokerError> {
|
) -> Result<BrokerSubscription<V>, BrokerError>
|
||||||
info!("Subscribing to timeline updates, subscription kind: {subscription:?}");
|
where
|
||||||
let kind = subscription.clone();
|
V: DeserializeOwned + Send + 'static,
|
||||||
|
{
|
||||||
|
subscribe_for_values(client, key, |_, value_str| {
|
||||||
|
match serde_json::from_str::<V>(value_str) {
|
||||||
|
Ok(value) => Some(value),
|
||||||
|
Err(e) => {
|
||||||
|
error!("Failed to parse value str '{value_str}': {e}");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Same as [`subscribe_for_json_values`], but allows specifying a custom parser for an etcd value string.
|
||||||
|
pub async fn subscribe_for_values<P, V>(
|
||||||
|
client: &mut Client,
|
||||||
|
key: SubscriptionKey,
|
||||||
|
value_parser: P,
|
||||||
|
) -> Result<BrokerSubscription<V>, BrokerError>
|
||||||
|
where
|
||||||
|
V: Send + 'static,
|
||||||
|
P: Fn(SubscriptionFullKey, &str) -> Option<V> + Send + 'static,
|
||||||
|
{
|
||||||
|
info!("Subscribing to broker value updates, key: {key:?}");
|
||||||
|
let subscription_key = key.clone();
|
||||||
|
|
||||||
let (watcher, mut stream) = client
|
let (watcher, mut stream) = client
|
||||||
.watch(
|
.watch(key.watch_key(), Some(WatchOptions::new().with_prefix()))
|
||||||
subscription.watch_key(),
|
|
||||||
Some(WatchOptions::new().with_prefix()),
|
|
||||||
)
|
|
||||||
.await
|
.await
|
||||||
.map_err(|e| {
|
.map_err(|e| {
|
||||||
BrokerError::EtcdClient(
|
BrokerError::EtcdClient(
|
||||||
e,
|
e,
|
||||||
format!("Failed to init the watch for subscription {subscription:?}"),
|
format!("Failed to init the watch for subscription {key:?}"),
|
||||||
)
|
)
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let (timeline_updates_sender, safekeeper_timeline_updates) = mpsc::unbounded_channel();
|
let (value_updates_sender, value_updates_receiver) = mpsc::unbounded_channel();
|
||||||
let watcher_handle = tokio::spawn(async move {
|
let watcher_handle = tokio::spawn(async move {
|
||||||
while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!(
|
while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!(
|
||||||
"Failed to get messages from the subscription stream, kind: {:?}, error: {e}", subscription.kind
|
"Failed to get messages from the subscription stream, kind: {:?}, error: {e}", key.kind
|
||||||
)))? {
|
)))? {
|
||||||
if resp.canceled() {
|
if resp.canceled() {
|
||||||
info!("Watch for timeline updates subscription was canceled, exiting");
|
info!("Watch for timeline updates subscription was canceled, exiting");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>> = HashMap::new();
|
|
||||||
// Keep track that the timeline data updates from etcd arrive in the right order.
|
|
||||||
// https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas
|
|
||||||
// > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering.
|
|
||||||
let mut timeline_etcd_versions: HashMap<ZTenantTimelineId, i64> = HashMap::new();
|
|
||||||
|
|
||||||
|
|
||||||
let events = resp.events();
|
let events = resp.events();
|
||||||
debug!("Processing {} events", events.len());
|
debug!("Processing {} events", events.len());
|
||||||
|
|
||||||
for event in events {
|
for event in events {
|
||||||
if EventType::Put == event.event_type() {
|
if EventType::Put == event.event_type() {
|
||||||
if let Some(new_etcd_kv) = event.kv() {
|
if let Some(new_etcd_kv) = event.kv() {
|
||||||
let new_kv_version = new_etcd_kv.version();
|
match parse_etcd_kv(new_etcd_kv, &value_parser, &key.cluster_prefix) {
|
||||||
let (key_str, value_str) = match extract_key_value_str(new_etcd_kv) {
|
Ok(Some((key, value))) => if let Err(e) = value_updates_sender.send(BrokerUpdate {
|
||||||
Ok(strs) => strs,
|
etcd_version: new_etcd_kv.version(),
|
||||||
Err(e) => {
|
key,
|
||||||
error!("Failed to represent etcd KV {new_etcd_kv:?} as pair of str: {e}");
|
value,
|
||||||
continue;
|
}) {
|
||||||
|
info!("Broker value updates for key {key:?} sender got dropped, exiting: {e}");
|
||||||
|
break;
|
||||||
},
|
},
|
||||||
};
|
Ok(None) => debug!("Ignoring key {key:?} : no value was returned by the parser"),
|
||||||
|
Err(BrokerError::KeyNotParsed(e)) => debug!("Unexpected key {key:?} for timeline update: {e}"),
|
||||||
match parse_safekeeper_timeline(&subscription, key_str, value_str) {
|
Err(e) => error!("Failed to represent etcd KV {new_etcd_kv:?}: {e}"),
|
||||||
Ok((zttid, timeline)) => {
|
|
||||||
match timeline_updates
|
|
||||||
.entry(zttid)
|
|
||||||
.or_default()
|
|
||||||
.entry(timeline.safekeeper_id)
|
|
||||||
{
|
|
||||||
hash_map::Entry::Occupied(mut o) => {
|
|
||||||
let old_etcd_kv_version = timeline_etcd_versions.get(&zttid).copied().unwrap_or(i64::MIN);
|
|
||||||
if old_etcd_kv_version < new_kv_version {
|
|
||||||
o.insert(timeline.info);
|
|
||||||
timeline_etcd_versions.insert(zttid,new_kv_version);
|
|
||||||
} else {
|
|
||||||
debug!("Skipping etcd timeline update due to older version compared to one that's already stored");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
hash_map::Entry::Vacant(v) => {
|
|
||||||
v.insert(timeline.info);
|
|
||||||
timeline_etcd_versions.insert(zttid,new_kv_version);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// it is normal to get other keys when we subscribe to everything
|
|
||||||
Err(BrokerError::InvalidKey(e)) => debug!("Unexpected key for timeline update: {e}"),
|
|
||||||
Err(e) => error!("Failed to parse timeline update: {e}"),
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Err(e) = timeline_updates_sender.send(timeline_updates) {
|
|
||||||
info!("Timeline updates sender got dropped, exiting: {e}");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}.instrument(info_span!("etcd_broker")));
|
}.instrument(info_span!("etcd_broker")));
|
||||||
|
|
||||||
Ok(SkTimelineSubscription {
|
Ok(BrokerSubscription {
|
||||||
kind,
|
key: subscription_key,
|
||||||
safekeeper_timeline_updates,
|
value_updates: value_updates_receiver,
|
||||||
watcher_handle,
|
watcher_handle,
|
||||||
watcher,
|
watcher,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_key_value_str(kv: &KeyValue) -> Result<(&str, &str), BrokerError> {
|
fn parse_etcd_kv<P, V>(
|
||||||
let key = kv.key_str().map_err(|e| {
|
kv: &KeyValue,
|
||||||
|
value_parser: &P,
|
||||||
|
cluster_prefix: &str,
|
||||||
|
) -> Result<Option<(SubscriptionFullKey, V)>, BrokerError>
|
||||||
|
where
|
||||||
|
P: Fn(SubscriptionFullKey, &str) -> Option<V>,
|
||||||
|
{
|
||||||
|
let key_str = kv.key_str().map_err(|e| {
|
||||||
BrokerError::EtcdClient(e, "Failed to extract key str out of etcd KV".to_string())
|
BrokerError::EtcdClient(e, "Failed to extract key str out of etcd KV".to_string())
|
||||||
})?;
|
})?;
|
||||||
let value = kv.value_str().map_err(|e| {
|
let value_str = kv.value_str().map_err(|e| {
|
||||||
BrokerError::EtcdClient(e, "Failed to extract value str out of etcd KV".to_string())
|
BrokerError::EtcdClient(e, "Failed to extract value str out of etcd KV".to_string())
|
||||||
})?;
|
})?;
|
||||||
Ok((key, value))
|
|
||||||
}
|
|
||||||
|
|
||||||
static SK_TIMELINE_KEY_REGEX: Lazy<Regex> = Lazy::new(|| {
|
if !key_str.starts_with(cluster_prefix) {
|
||||||
Regex::new("/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]]+)$")
|
return Err(BrokerError::KeyNotParsed(format!(
|
||||||
.expect("wrong regex for safekeeper timeline etcd key")
|
"KV has unexpected key '{key_str}' that does not start with cluster prefix {cluster_prefix}"
|
||||||
});
|
|
||||||
|
|
||||||
fn parse_safekeeper_timeline(
|
|
||||||
subscription: &SkTimelineSubscriptionKind,
|
|
||||||
key_str: &str,
|
|
||||||
value_str: &str,
|
|
||||||
) -> Result<(ZTenantTimelineId, SafekeeperTimeline), BrokerError> {
|
|
||||||
let broker_prefix = subscription.broker_etcd_prefix.as_str();
|
|
||||||
if !key_str.starts_with(broker_prefix) {
|
|
||||||
return Err(BrokerError::InvalidKey(format!(
|
|
||||||
"KV has unexpected key '{key_str}' that does not start with broker prefix {broker_prefix}"
|
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
let key_part = &key_str[broker_prefix.len()..];
|
let key = SubscriptionFullKey::from_str(&key_str[cluster_prefix.len()..]).map_err(|e| {
|
||||||
let key_captures = match SK_TIMELINE_KEY_REGEX.captures(key_part) {
|
BrokerError::KeyNotParsed(format!("Failed to parse KV key '{key_str}': {e}"))
|
||||||
Some(captures) => captures,
|
|
||||||
None => {
|
|
||||||
return Err(BrokerError::InvalidKey(format!(
|
|
||||||
"KV has unexpected key part '{key_part}' that does not match required regex {}",
|
|
||||||
SK_TIMELINE_KEY_REGEX.as_str()
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let info = serde_json::from_str(value_str).map_err(|e| {
|
|
||||||
BrokerError::ParsingError(format!(
|
|
||||||
"Failed to parse '{value_str}' as safekeeper timeline info: {e}"
|
|
||||||
))
|
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let zttid = ZTenantTimelineId::new(
|
Ok(value_parser(key, value_str).map(|value| (key, value)))
|
||||||
parse_capture(&key_captures, 1).map_err(BrokerError::ParsingError)?,
|
|
||||||
parse_capture(&key_captures, 2).map_err(BrokerError::ParsingError)?,
|
|
||||||
);
|
|
||||||
let safekeeper_id = NodeId(parse_capture(&key_captures, 3).map_err(BrokerError::ParsingError)?);
|
|
||||||
|
|
||||||
Ok((
|
|
||||||
zttid,
|
|
||||||
SafekeeperTimeline {
|
|
||||||
safekeeper_id,
|
|
||||||
info,
|
|
||||||
},
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_capture<T>(caps: &Captures, index: usize) -> Result<T, String>
|
|
||||||
where
|
|
||||||
T: FromStr,
|
|
||||||
<T as FromStr>::Err: Display,
|
|
||||||
{
|
|
||||||
let capture_match = caps
|
|
||||||
.get(index)
|
|
||||||
.ok_or_else(|| format!("Failed to get capture match at index {index}"))?
|
|
||||||
.as_str();
|
|
||||||
capture_match.parse().map_err(|e| {
|
|
||||||
format!(
|
|
||||||
"Failed to parse {} from {capture_match}: {e}",
|
|
||||||
std::any::type_name::<T>()
|
|
||||||
)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use utils::zid::ZTimelineId;
|
|
||||||
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn typical_etcd_prefix_should_be_parsed() {
|
|
||||||
let prefix = "neon";
|
|
||||||
let tenant_id = ZTenantId::generate();
|
|
||||||
let timeline_id = ZTimelineId::generate();
|
|
||||||
let all_subscription = SkTimelineSubscriptionKind {
|
|
||||||
broker_etcd_prefix: prefix.to_string(),
|
|
||||||
kind: SubscriptionKind::All,
|
|
||||||
};
|
|
||||||
let tenant_subscription = SkTimelineSubscriptionKind {
|
|
||||||
broker_etcd_prefix: prefix.to_string(),
|
|
||||||
kind: SubscriptionKind::Tenant(tenant_id),
|
|
||||||
};
|
|
||||||
let timeline_subscription = SkTimelineSubscriptionKind {
|
|
||||||
broker_etcd_prefix: prefix.to_string(),
|
|
||||||
kind: SubscriptionKind::Timeline(ZTenantTimelineId::new(tenant_id, timeline_id)),
|
|
||||||
};
|
|
||||||
|
|
||||||
let typical_etcd_kv_strs = [
|
|
||||||
(
|
|
||||||
format!("{prefix}/{tenant_id}/{timeline_id}/safekeeper/1"),
|
|
||||||
r#"{"last_log_term":231,"flush_lsn":"0/241BB70","commit_lsn":"0/241BB70","backup_lsn":"0/2000000","remote_consistent_lsn":"0/0","peer_horizon_lsn":"0/16960E8","safekeeper_connstr":"something.local:1234","pageserver_connstr":"postgresql://(null):@somethine.else.local:3456"}"#,
|
|
||||||
),
|
|
||||||
(
|
|
||||||
format!("{prefix}/{tenant_id}/{timeline_id}/safekeeper/13"),
|
|
||||||
r#"{"last_log_term":231,"flush_lsn":"0/241BB70","commit_lsn":"0/241BB70","backup_lsn":"0/2000000","remote_consistent_lsn":"0/0","peer_horizon_lsn":"0/16960E8","safekeeper_connstr":"something.local:1234","pageserver_connstr":"postgresql://(null):@somethine.else.local:3456"}"#,
|
|
||||||
),
|
|
||||||
];
|
|
||||||
|
|
||||||
for (key_string, value_str) in typical_etcd_kv_strs {
|
|
||||||
for subscription in [
|
|
||||||
&all_subscription,
|
|
||||||
&tenant_subscription,
|
|
||||||
&timeline_subscription,
|
|
||||||
] {
|
|
||||||
let (id, _timeline) =
|
|
||||||
parse_safekeeper_timeline(subscription, &key_string, value_str)
|
|
||||||
.unwrap_or_else(|e| panic!("Should be able to parse etcd key string '{key_string}' and etcd value string '{value_str}' for subscription {subscription:?}, but got: {e}"));
|
|
||||||
assert_eq!(id, ZTenantTimelineId::new(tenant_id, timeline_id));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
310
libs/etcd_broker/src/subscription_key.rs
Normal file
310
libs/etcd_broker/src/subscription_key.rs
Normal file
@@ -0,0 +1,310 @@
|
|||||||
|
//! Etcd broker keys, used in the project and shared between instances.
|
||||||
|
//! The keys are split into two categories:
|
||||||
|
//!
|
||||||
|
//! * [`SubscriptionFullKey`] full key format: `<cluster_prefix>/<tenant>/<timeline>/<node_kind>/<operation>/<node_id>`
|
||||||
|
//! Always returned from etcd in this form, always starts with the user key provided.
|
||||||
|
//!
|
||||||
|
//! * [`SubscriptionKey`] user input key format: always partial, since it's unknown which `node_id`'s are available.
|
||||||
|
//! Full key always starts with the user input one, due to etcd subscription properties.
|
||||||
|
|
||||||
|
use std::{fmt::Display, str::FromStr};
|
||||||
|
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
use regex::{Captures, Regex};
|
||||||
|
use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId};
|
||||||
|
|
||||||
|
/// The subscription kind to the timeline updates from safekeeper.
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
|
pub struct SubscriptionKey {
|
||||||
|
/// Generic cluster prefix, allowing to use the same etcd instance by multiple logic groups.
|
||||||
|
pub cluster_prefix: String,
|
||||||
|
/// The subscription kind.
|
||||||
|
pub kind: SubscriptionKind,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// All currently possible key kinds of an etcd broker subscription.
|
||||||
|
/// Etcd works in such a way that every key that starts with the given subscription key is considered matching and
|
||||||
|
/// returned as part of the subscription.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
|
pub enum SubscriptionKind {
|
||||||
|
/// Get every update in etcd.
|
||||||
|
All,
|
||||||
|
/// Get etcd updates for any timeline of a certain tenant, affected by any operation from any node kind.
|
||||||
|
TenantTimelines(ZTenantId),
|
||||||
|
/// Get etcd updates for a certain timeline of a tenant, affected by any operation from any node kind.
|
||||||
|
Timeline(ZTenantTimelineId),
|
||||||
|
/// Get etcd timeline updates, specific to a certain node kind.
|
||||||
|
Node(ZTenantTimelineId, NodeKind),
|
||||||
|
/// Get etcd timeline updates for a certain operation on specific nodes.
|
||||||
|
Operation(ZTenantTimelineId, NodeKind, OperationKind),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// All kinds of nodes, able to write into etcd.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
|
pub enum NodeKind {
|
||||||
|
Safekeeper,
|
||||||
|
Pageserver,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
|
pub enum OperationKind {
|
||||||
|
Safekeeper(SkOperationKind),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Current operations, running inside the safekeeper node.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
|
pub enum SkOperationKind {
|
||||||
|
TimelineInfo,
|
||||||
|
WalBackup,
|
||||||
|
}
|
||||||
|
|
||||||
|
static SUBSCRIPTION_FULL_KEY_REGEX: Lazy<Regex> = Lazy::new(|| {
|
||||||
|
Regex::new("/([[:xdigit:]]+)/([[:xdigit:]]+)/([^/]+)/([^/]+)/([[:digit:]]+)$")
|
||||||
|
.expect("wrong subscription full etcd key regex")
|
||||||
|
});
|
||||||
|
|
||||||
|
/// Full key, received from etcd during any of the component's work.
|
||||||
|
/// No other etcd keys are considered during system's work.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
|
pub struct SubscriptionFullKey {
|
||||||
|
pub id: ZTenantTimelineId,
|
||||||
|
pub node_kind: NodeKind,
|
||||||
|
pub operation: OperationKind,
|
||||||
|
pub node_id: NodeId,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SubscriptionKey {
|
||||||
|
/// Subscribes for all etcd updates.
|
||||||
|
pub fn all(cluster_prefix: String) -> Self {
|
||||||
|
SubscriptionKey {
|
||||||
|
cluster_prefix,
|
||||||
|
kind: SubscriptionKind::All,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Subscribes to a given timeline info updates from safekeepers.
|
||||||
|
pub fn sk_timeline_info(cluster_prefix: String, timeline: ZTenantTimelineId) -> Self {
|
||||||
|
Self {
|
||||||
|
cluster_prefix,
|
||||||
|
kind: SubscriptionKind::Operation(
|
||||||
|
timeline,
|
||||||
|
NodeKind::Safekeeper,
|
||||||
|
OperationKind::Safekeeper(SkOperationKind::TimelineInfo),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Subscribes to all timeline updates during specific operations, running on the corresponding nodes.
|
||||||
|
pub fn operation(
|
||||||
|
cluster_prefix: String,
|
||||||
|
timeline: ZTenantTimelineId,
|
||||||
|
node_kind: NodeKind,
|
||||||
|
operation: OperationKind,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
cluster_prefix,
|
||||||
|
kind: SubscriptionKind::Operation(timeline, node_kind, operation),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Etcd key prefix to watch in order to receive the updates matching this subscription kind.
|
||||||
|
pub fn watch_key(&self) -> String {
|
||||||
|
let cluster_prefix = &self.cluster_prefix;
|
||||||
|
match self.kind {
|
||||||
|
SubscriptionKind::All => cluster_prefix.to_string(),
|
||||||
|
SubscriptionKind::TenantTimelines(tenant_id) => {
|
||||||
|
format!("{cluster_prefix}/{tenant_id}")
|
||||||
|
}
|
||||||
|
SubscriptionKind::Timeline(id) => {
|
||||||
|
format!("{cluster_prefix}/{id}")
|
||||||
|
}
|
||||||
|
SubscriptionKind::Node(id, node_kind) => {
|
||||||
|
format!("{cluster_prefix}/{id}/{node_kind}")
|
||||||
|
}
|
||||||
|
SubscriptionKind::Operation(id, node_kind, operation_kind) => {
|
||||||
|
format!("{cluster_prefix}/{id}/{node_kind}/{operation_kind}")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Display for OperationKind {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
OperationKind::Safekeeper(o) => o.fmt(f),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for OperationKind {
|
||||||
|
type Err = String;
|
||||||
|
|
||||||
|
fn from_str(operation_kind_str: &str) -> Result<Self, Self::Err> {
|
||||||
|
match operation_kind_str {
|
||||||
|
"timeline_info" => Ok(OperationKind::Safekeeper(SkOperationKind::TimelineInfo)),
|
||||||
|
"wal_backup" => Ok(OperationKind::Safekeeper(SkOperationKind::WalBackup)),
|
||||||
|
_ => Err(format!("Unknown operation kind: {operation_kind_str}")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Display for SubscriptionFullKey {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
let Self {
|
||||||
|
id,
|
||||||
|
node_kind,
|
||||||
|
operation,
|
||||||
|
node_id,
|
||||||
|
} = self;
|
||||||
|
write!(f, "{id}/{node_kind}/{operation}/{node_id}")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for SubscriptionFullKey {
|
||||||
|
type Err = String;
|
||||||
|
|
||||||
|
fn from_str(subscription_kind_str: &str) -> Result<Self, Self::Err> {
|
||||||
|
let key_captures = match SUBSCRIPTION_FULL_KEY_REGEX.captures(subscription_kind_str) {
|
||||||
|
Some(captures) => captures,
|
||||||
|
None => {
|
||||||
|
return Err(format!(
|
||||||
|
"Subscription kind str does not match a subscription full key regex {}",
|
||||||
|
SUBSCRIPTION_FULL_KEY_REGEX.as_str()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
id: ZTenantTimelineId::new(
|
||||||
|
parse_capture(&key_captures, 1)?,
|
||||||
|
parse_capture(&key_captures, 2)?,
|
||||||
|
),
|
||||||
|
node_kind: parse_capture(&key_captures, 3)?,
|
||||||
|
operation: parse_capture(&key_captures, 4)?,
|
||||||
|
node_id: NodeId(parse_capture(&key_captures, 5)?),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_capture<T>(caps: &Captures, index: usize) -> Result<T, String>
|
||||||
|
where
|
||||||
|
T: FromStr,
|
||||||
|
<T as FromStr>::Err: Display,
|
||||||
|
{
|
||||||
|
let capture_match = caps
|
||||||
|
.get(index)
|
||||||
|
.ok_or_else(|| format!("Failed to get capture match at index {index}"))?
|
||||||
|
.as_str();
|
||||||
|
capture_match.parse().map_err(|e| {
|
||||||
|
format!(
|
||||||
|
"Failed to parse {} from {capture_match}: {e}",
|
||||||
|
std::any::type_name::<T>()
|
||||||
|
)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Display for NodeKind {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
Self::Safekeeper => write!(f, "safekeeper"),
|
||||||
|
Self::Pageserver => write!(f, "pageserver"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for NodeKind {
|
||||||
|
type Err = String;
|
||||||
|
|
||||||
|
fn from_str(node_kind_str: &str) -> Result<Self, Self::Err> {
|
||||||
|
match node_kind_str {
|
||||||
|
"safekeeper" => Ok(Self::Safekeeper),
|
||||||
|
"pageserver" => Ok(Self::Pageserver),
|
||||||
|
_ => Err(format!("Invalid node kind: {node_kind_str}")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Display for SkOperationKind {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
Self::TimelineInfo => write!(f, "timeline_info"),
|
||||||
|
Self::WalBackup => write!(f, "wal_backup"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for SkOperationKind {
|
||||||
|
type Err = String;
|
||||||
|
|
||||||
|
fn from_str(operation_str: &str) -> Result<Self, Self::Err> {
|
||||||
|
match operation_str {
|
||||||
|
"timeline_info" => Ok(Self::TimelineInfo),
|
||||||
|
"wal_backup" => Ok(Self::WalBackup),
|
||||||
|
_ => Err(format!("Invalid operation: {operation_str}")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use utils::zid::ZTimelineId;
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn full_cluster_key_parsing() {
|
||||||
|
let prefix = "neon";
|
||||||
|
let node_kind = NodeKind::Safekeeper;
|
||||||
|
let operation_kind = OperationKind::Safekeeper(SkOperationKind::WalBackup);
|
||||||
|
let tenant_id = ZTenantId::generate();
|
||||||
|
let timeline_id = ZTimelineId::generate();
|
||||||
|
let id = ZTenantTimelineId::new(tenant_id, timeline_id);
|
||||||
|
let node_id = NodeId(1);
|
||||||
|
|
||||||
|
let timeline_subscription_keys = [
|
||||||
|
SubscriptionKey {
|
||||||
|
cluster_prefix: prefix.to_string(),
|
||||||
|
kind: SubscriptionKind::All,
|
||||||
|
},
|
||||||
|
SubscriptionKey {
|
||||||
|
cluster_prefix: prefix.to_string(),
|
||||||
|
kind: SubscriptionKind::TenantTimelines(tenant_id),
|
||||||
|
},
|
||||||
|
SubscriptionKey {
|
||||||
|
cluster_prefix: prefix.to_string(),
|
||||||
|
kind: SubscriptionKind::Timeline(id),
|
||||||
|
},
|
||||||
|
SubscriptionKey {
|
||||||
|
cluster_prefix: prefix.to_string(),
|
||||||
|
kind: SubscriptionKind::Node(id, node_kind),
|
||||||
|
},
|
||||||
|
SubscriptionKey {
|
||||||
|
cluster_prefix: prefix.to_string(),
|
||||||
|
kind: SubscriptionKind::Operation(id, node_kind, operation_kind),
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
let full_key_string = format!(
|
||||||
|
"{}/{node_id}",
|
||||||
|
timeline_subscription_keys.last().unwrap().watch_key()
|
||||||
|
);
|
||||||
|
|
||||||
|
for key in timeline_subscription_keys {
|
||||||
|
assert!(full_key_string.starts_with(&key.watch_key()), "Full key '{full_key_string}' should start with any of the keys, keys, but {key:?} did not match");
|
||||||
|
}
|
||||||
|
|
||||||
|
let full_key = SubscriptionFullKey::from_str(&full_key_string).unwrap_or_else(|e| {
|
||||||
|
panic!("Failed to parse {full_key_string} as a subscription full key: {e}")
|
||||||
|
});
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
full_key,
|
||||||
|
SubscriptionFullKey {
|
||||||
|
id,
|
||||||
|
node_kind,
|
||||||
|
operation: operation_kind,
|
||||||
|
node_id
|
||||||
|
}
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
35
libs/etcd_broker/src/subscription_value.rs
Normal file
35
libs/etcd_broker/src/subscription_value.rs
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
//! Module for the values to put into etcd.
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use serde_with::{serde_as, DisplayFromStr};
|
||||||
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
|
/// Data about safekeeper's timeline. Fields made optional for easy migrations.
|
||||||
|
#[serde_as]
|
||||||
|
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||||
|
pub struct SkTimelineInfo {
|
||||||
|
/// Term of the last entry.
|
||||||
|
pub last_log_term: Option<u64>,
|
||||||
|
/// LSN of the last record.
|
||||||
|
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub flush_lsn: Option<Lsn>,
|
||||||
|
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||||
|
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub commit_lsn: Option<Lsn>,
|
||||||
|
/// LSN up to which safekeeper has backed WAL.
|
||||||
|
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub backup_lsn: Option<Lsn>,
|
||||||
|
/// LSN of last checkpoint uploaded by pageserver.
|
||||||
|
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub remote_consistent_lsn: Option<Lsn>,
|
||||||
|
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub peer_horizon_lsn: Option<Lsn>,
|
||||||
|
/// A connection string to use for WAL receiving.
|
||||||
|
#[serde(default)]
|
||||||
|
pub safekeeper_connstr: Option<String>,
|
||||||
|
}
|
||||||
@@ -4,6 +4,7 @@ use log::*;
|
|||||||
use postgres::types::PgLsn;
|
use postgres::types::PgLsn;
|
||||||
use postgres::Client;
|
use postgres::Client;
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
|
use std::fs;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::process::{Command, Stdio};
|
use std::process::{Command, Stdio};
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
@@ -69,6 +70,12 @@ impl Conf {
|
|||||||
|
|
||||||
pub fn start_server(&self) -> Result<PostgresServer> {
|
pub fn start_server(&self) -> Result<PostgresServer> {
|
||||||
info!("Starting Postgres server in {:?}", self.datadir);
|
info!("Starting Postgres server in {:?}", self.datadir);
|
||||||
|
let log_file = fs::File::create(self.datadir.join("pg.log")).with_context(|| {
|
||||||
|
format!(
|
||||||
|
"Failed to create pg.log file in directory {}",
|
||||||
|
self.datadir.display()
|
||||||
|
)
|
||||||
|
})?;
|
||||||
let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols)
|
let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols)
|
||||||
let unix_socket_dir_path = unix_socket_dir.path().to_owned();
|
let unix_socket_dir_path = unix_socket_dir.path().to_owned();
|
||||||
let server_process = self
|
let server_process = self
|
||||||
@@ -84,7 +91,7 @@ impl Conf {
|
|||||||
// Disable background processes as much as possible
|
// Disable background processes as much as possible
|
||||||
.args(&["-c", "wal_writer_delay=10s"])
|
.args(&["-c", "wal_writer_delay=10s"])
|
||||||
.args(&["-c", "autovacuum=off"])
|
.args(&["-c", "autovacuum=off"])
|
||||||
.stderr(Stdio::null())
|
.stderr(Stdio::from(log_file))
|
||||||
.spawn()?;
|
.spawn()?;
|
||||||
let server = PostgresServer {
|
let server = PostgresServer {
|
||||||
process: server_process,
|
process: server_process,
|
||||||
|
|||||||
@@ -13,13 +13,10 @@ use std::fmt;
|
|||||||
use std::io::{self, Write};
|
use std::io::{self, Write};
|
||||||
use std::net::{Shutdown, SocketAddr, TcpStream};
|
use std::net::{Shutdown, SocketAddr, TcpStream};
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::sync::atomic::{AtomicBool, Ordering};
|
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
|
||||||
static PGBACKEND_SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
|
|
||||||
|
|
||||||
pub trait Handler {
|
pub trait Handler {
|
||||||
/// Handle single query.
|
/// Handle single query.
|
||||||
/// postgres_backend will issue ReadyForQuery after calling this (this
|
/// postgres_backend will issue ReadyForQuery after calling this (this
|
||||||
@@ -45,6 +42,10 @@ pub trait Handler {
|
|||||||
fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
|
fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
|
||||||
bail!("JWT auth failed")
|
bail!("JWT auth failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn is_shutdown_requested(&self) -> bool {
|
||||||
|
false
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// PostgresBackend protocol state.
|
/// PostgresBackend protocol state.
|
||||||
@@ -274,7 +275,7 @@ impl PostgresBackend {
|
|||||||
|
|
||||||
let mut unnamed_query_string = Bytes::new();
|
let mut unnamed_query_string = Bytes::new();
|
||||||
|
|
||||||
while !PGBACKEND_SHUTDOWN_REQUESTED.load(Ordering::Relaxed) {
|
while !handler.is_shutdown_requested() {
|
||||||
match self.read_message() {
|
match self.read_message() {
|
||||||
Ok(message) => {
|
Ok(message) => {
|
||||||
if let Some(msg) = message {
|
if let Some(msg) = message {
|
||||||
@@ -493,8 +494,3 @@ impl PostgresBackend {
|
|||||||
Ok(ProcessMsgResult::Continue)
|
Ok(ProcessMsgResult::Continue)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set the flag to inform connections to cancel
|
|
||||||
pub fn set_pgbackend_shutdown_requested() {
|
|
||||||
PGBACKEND_SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -926,10 +926,10 @@ impl<'a> BeMessage<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Zenith extension of postgres replication protocol
|
// Neon extension of postgres replication protocol
|
||||||
// See ZENITH_STATUS_UPDATE_TAG_BYTE
|
// See NEON_STATUS_UPDATE_TAG_BYTE
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
|
||||||
pub struct ZenithFeedback {
|
pub struct ReplicationFeedback {
|
||||||
// Last known size of the timeline. Used to enforce timeline size limit.
|
// Last known size of the timeline. Used to enforce timeline size limit.
|
||||||
pub current_timeline_size: u64,
|
pub current_timeline_size: u64,
|
||||||
// Parts of StandbyStatusUpdate we resend to compute via safekeeper
|
// Parts of StandbyStatusUpdate we resend to compute via safekeeper
|
||||||
@@ -939,13 +939,13 @@ pub struct ZenithFeedback {
|
|||||||
pub ps_replytime: SystemTime,
|
pub ps_replytime: SystemTime,
|
||||||
}
|
}
|
||||||
|
|
||||||
// NOTE: Do not forget to increment this number when adding new fields to ZenithFeedback.
|
// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback.
|
||||||
// Do not remove previously available fields because this might be backwards incompatible.
|
// Do not remove previously available fields because this might be backwards incompatible.
|
||||||
pub const ZENITH_FEEDBACK_FIELDS_NUMBER: u8 = 5;
|
pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5;
|
||||||
|
|
||||||
impl ZenithFeedback {
|
impl ReplicationFeedback {
|
||||||
pub fn empty() -> ZenithFeedback {
|
pub fn empty() -> ReplicationFeedback {
|
||||||
ZenithFeedback {
|
ReplicationFeedback {
|
||||||
current_timeline_size: 0,
|
current_timeline_size: 0,
|
||||||
ps_writelsn: 0,
|
ps_writelsn: 0,
|
||||||
ps_applylsn: 0,
|
ps_applylsn: 0,
|
||||||
@@ -954,7 +954,7 @@ impl ZenithFeedback {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Serialize ZenithFeedback using custom format
|
// Serialize ReplicationFeedback using custom format
|
||||||
// to support protocol extensibility.
|
// to support protocol extensibility.
|
||||||
//
|
//
|
||||||
// Following layout is used:
|
// Following layout is used:
|
||||||
@@ -965,7 +965,7 @@ impl ZenithFeedback {
|
|||||||
// uint32 - value length in bytes
|
// uint32 - value length in bytes
|
||||||
// value itself
|
// value itself
|
||||||
pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
|
pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
|
||||||
buf.put_u8(ZENITH_FEEDBACK_FIELDS_NUMBER); // # of keys
|
buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys
|
||||||
write_cstr(&Bytes::from("current_timeline_size"), buf)?;
|
write_cstr(&Bytes::from("current_timeline_size"), buf)?;
|
||||||
buf.put_i32(8);
|
buf.put_i32(8);
|
||||||
buf.put_u64(self.current_timeline_size);
|
buf.put_u64(self.current_timeline_size);
|
||||||
@@ -992,9 +992,9 @@ impl ZenithFeedback {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
// Deserialize ZenithFeedback message
|
// Deserialize ReplicationFeedback message
|
||||||
pub fn parse(mut buf: Bytes) -> ZenithFeedback {
|
pub fn parse(mut buf: Bytes) -> ReplicationFeedback {
|
||||||
let mut zf = ZenithFeedback::empty();
|
let mut zf = ReplicationFeedback::empty();
|
||||||
let nfields = buf.get_u8();
|
let nfields = buf.get_u8();
|
||||||
let mut i = 0;
|
let mut i = 0;
|
||||||
while i < nfields {
|
while i < nfields {
|
||||||
@@ -1035,14 +1035,14 @@ impl ZenithFeedback {
|
|||||||
_ => {
|
_ => {
|
||||||
let len = buf.get_i32();
|
let len = buf.get_i32();
|
||||||
warn!(
|
warn!(
|
||||||
"ZenithFeedback parse. unknown key {} of len {}. Skip it.",
|
"ReplicationFeedback parse. unknown key {} of len {}. Skip it.",
|
||||||
key, len
|
key, len
|
||||||
);
|
);
|
||||||
buf.advance(len as usize);
|
buf.advance(len as usize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
trace!("ZenithFeedback parsed is {:?}", zf);
|
trace!("ReplicationFeedback parsed is {:?}", zf);
|
||||||
zf
|
zf
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1052,8 +1052,8 @@ mod tests {
|
|||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_zenithfeedback_serialization() {
|
fn test_replication_feedback_serialization() {
|
||||||
let mut zf = ZenithFeedback::empty();
|
let mut zf = ReplicationFeedback::empty();
|
||||||
// Fill zf with some values
|
// Fill zf with some values
|
||||||
zf.current_timeline_size = 12345678;
|
zf.current_timeline_size = 12345678;
|
||||||
// Set rounded time to be able to compare it with deserialized value,
|
// Set rounded time to be able to compare it with deserialized value,
|
||||||
@@ -1062,13 +1062,13 @@ mod tests {
|
|||||||
let mut data = BytesMut::new();
|
let mut data = BytesMut::new();
|
||||||
zf.serialize(&mut data).unwrap();
|
zf.serialize(&mut data).unwrap();
|
||||||
|
|
||||||
let zf_parsed = ZenithFeedback::parse(data.freeze());
|
let zf_parsed = ReplicationFeedback::parse(data.freeze());
|
||||||
assert_eq!(zf, zf_parsed);
|
assert_eq!(zf, zf_parsed);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_zenithfeedback_unknown_key() {
|
fn test_replication_feedback_unknown_key() {
|
||||||
let mut zf = ZenithFeedback::empty();
|
let mut zf = ReplicationFeedback::empty();
|
||||||
// Fill zf with some values
|
// Fill zf with some values
|
||||||
zf.current_timeline_size = 12345678;
|
zf.current_timeline_size = 12345678;
|
||||||
// Set rounded time to be able to compare it with deserialized value,
|
// Set rounded time to be able to compare it with deserialized value,
|
||||||
@@ -1079,7 +1079,7 @@ mod tests {
|
|||||||
|
|
||||||
// Add an extra field to the buffer and adjust number of keys
|
// Add an extra field to the buffer and adjust number of keys
|
||||||
if let Some(first) = data.first_mut() {
|
if let Some(first) = data.first_mut() {
|
||||||
*first = ZENITH_FEEDBACK_FIELDS_NUMBER + 1;
|
*first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
write_cstr(&Bytes::from("new_field_one"), &mut data).unwrap();
|
write_cstr(&Bytes::from("new_field_one"), &mut data).unwrap();
|
||||||
@@ -1087,7 +1087,7 @@ mod tests {
|
|||||||
data.put_u64(42);
|
data.put_u64(42);
|
||||||
|
|
||||||
// Parse serialized data and check that new field is not parsed
|
// Parse serialized data and check that new field is not parsed
|
||||||
let zf_parsed = ZenithFeedback::parse(data.freeze());
|
let zf_parsed = ReplicationFeedback::parse(data.freeze());
|
||||||
assert_eq!(zf, zf_parsed);
|
assert_eq!(zf, zf_parsed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ use safekeeper::defaults::{
|
|||||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
|
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
|
||||||
};
|
};
|
||||||
use std::collections::{BTreeSet, HashMap};
|
use std::collections::{BTreeSet, HashMap};
|
||||||
use std::path::Path;
|
use std::path::{Path, PathBuf};
|
||||||
use std::process::exit;
|
use std::process::exit;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use utils::{
|
use utils::{
|
||||||
@@ -159,6 +159,20 @@ fn main() -> Result<()> {
|
|||||||
.about("Create a new blank timeline")
|
.about("Create a new blank timeline")
|
||||||
.arg(tenant_id_arg.clone())
|
.arg(tenant_id_arg.clone())
|
||||||
.arg(branch_name_arg.clone()))
|
.arg(branch_name_arg.clone()))
|
||||||
|
.subcommand(App::new("import")
|
||||||
|
.about("Import timeline from basebackup directory")
|
||||||
|
.arg(tenant_id_arg.clone())
|
||||||
|
.arg(timeline_id_arg.clone())
|
||||||
|
.arg(Arg::new("node-name").long("node-name").takes_value(true)
|
||||||
|
.help("Name to assign to the imported timeline"))
|
||||||
|
.arg(Arg::new("base-tarfile").long("base-tarfile").takes_value(true)
|
||||||
|
.help("Basebackup tarfile to import"))
|
||||||
|
.arg(Arg::new("base-lsn").long("base-lsn").takes_value(true)
|
||||||
|
.help("Lsn the basebackup starts at"))
|
||||||
|
.arg(Arg::new("wal-tarfile").long("wal-tarfile").takes_value(true)
|
||||||
|
.help("Wal to add after base"))
|
||||||
|
.arg(Arg::new("end-lsn").long("end-lsn").takes_value(true)
|
||||||
|
.help("Lsn the basebackup ends at")))
|
||||||
).subcommand(
|
).subcommand(
|
||||||
App::new("tenant")
|
App::new("tenant")
|
||||||
.setting(AppSettings::ArgRequiredElseHelp)
|
.setting(AppSettings::ArgRequiredElseHelp)
|
||||||
@@ -613,6 +627,43 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
|||||||
timeline.timeline_id, last_record_lsn, tenant_id,
|
timeline.timeline_id, last_record_lsn, tenant_id,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
Some(("import", import_match)) => {
|
||||||
|
let tenant_id = get_tenant_id(import_match, env)?;
|
||||||
|
let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided");
|
||||||
|
let name = import_match
|
||||||
|
.value_of("node-name")
|
||||||
|
.ok_or_else(|| anyhow!("No node name provided"))?;
|
||||||
|
|
||||||
|
// Parse base inputs
|
||||||
|
let base_tarfile = import_match
|
||||||
|
.value_of("base-tarfile")
|
||||||
|
.map(|s| PathBuf::from_str(s).unwrap())
|
||||||
|
.ok_or_else(|| anyhow!("No base-tarfile provided"))?;
|
||||||
|
let base_lsn = Lsn::from_str(
|
||||||
|
import_match
|
||||||
|
.value_of("base-lsn")
|
||||||
|
.ok_or_else(|| anyhow!("No base-lsn provided"))?,
|
||||||
|
)?;
|
||||||
|
let base = (base_lsn, base_tarfile);
|
||||||
|
|
||||||
|
// Parse pg_wal inputs
|
||||||
|
let wal_tarfile = import_match
|
||||||
|
.value_of("wal-tarfile")
|
||||||
|
.map(|s| PathBuf::from_str(s).unwrap());
|
||||||
|
let end_lsn = import_match
|
||||||
|
.value_of("end-lsn")
|
||||||
|
.map(|s| Lsn::from_str(s).unwrap());
|
||||||
|
// TODO validate both or none are provided
|
||||||
|
let pg_wal = end_lsn.zip(wal_tarfile);
|
||||||
|
|
||||||
|
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
||||||
|
println!("Importing timeline into pageserver ...");
|
||||||
|
pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal)?;
|
||||||
|
println!("Creating node for imported timeline ...");
|
||||||
|
env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;
|
||||||
|
cplane.new_node(tenant_id, name, timeline_id, None, None)?;
|
||||||
|
println!("Done");
|
||||||
|
}
|
||||||
Some(("branch", branch_match)) => {
|
Some(("branch", branch_match)) => {
|
||||||
let tenant_id = get_tenant_id(branch_match, env)?;
|
let tenant_id = get_tenant_id(branch_match, env)?;
|
||||||
let new_branch_name = branch_match
|
let new_branch_name = branch_match
|
||||||
|
|||||||
@@ -61,6 +61,7 @@ utils = { path = "../libs/utils" }
|
|||||||
remote_storage = { path = "../libs/remote_storage" }
|
remote_storage = { path = "../libs/remote_storage" }
|
||||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||||
close_fds = "0.3.2"
|
close_fds = "0.3.2"
|
||||||
|
walkdir = "2.3.2"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
hex-literal = "0.3"
|
hex-literal = "0.3"
|
||||||
|
|||||||
@@ -69,7 +69,7 @@ Repository
|
|||||||
|
|
||||||
The repository stores all the page versions, or WAL records needed to
|
The repository stores all the page versions, or WAL records needed to
|
||||||
reconstruct them. Each tenant has a separate Repository, which is
|
reconstruct them. Each tenant has a separate Repository, which is
|
||||||
stored in the .zenith/tenants/<tenantid> directory.
|
stored in the .neon/tenants/<tenantid> directory.
|
||||||
|
|
||||||
Repository is an abstract trait, defined in `repository.rs`. It is
|
Repository is an abstract trait, defined in `repository.rs`. It is
|
||||||
implemented by the LayeredRepository object in
|
implemented by the LayeredRepository object in
|
||||||
@@ -92,7 +92,7 @@ Each repository also has a WAL redo manager associated with it, see
|
|||||||
records, whenever we need to reconstruct a page version from WAL to
|
records, whenever we need to reconstruct a page version from WAL to
|
||||||
satisfy a GetPage@LSN request, or to avoid accumulating too much WAL
|
satisfy a GetPage@LSN request, or to avoid accumulating too much WAL
|
||||||
for a page. The WAL redo manager uses a Postgres process running in
|
for a page. The WAL redo manager uses a Postgres process running in
|
||||||
special zenith wal-redo mode to do the actual WAL redo, and
|
special Neon wal-redo mode to do the actual WAL redo, and
|
||||||
communicates with the process using a pipe.
|
communicates with the process using a pipe.
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -13,6 +13,7 @@
|
|||||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||||
use bytes::{BufMut, BytesMut};
|
use bytes::{BufMut, BytesMut};
|
||||||
use fail::fail_point;
|
use fail::fail_point;
|
||||||
|
use itertools::Itertools;
|
||||||
use std::fmt::Write as FmtWrite;
|
use std::fmt::Write as FmtWrite;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
@@ -21,7 +22,7 @@ use std::time::SystemTime;
|
|||||||
use tar::{Builder, EntryType, Header};
|
use tar::{Builder, EntryType, Header};
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
|
||||||
use crate::reltag::SlruKind;
|
use crate::reltag::{RelTag, SlruKind};
|
||||||
use crate::repository::Timeline;
|
use crate::repository::Timeline;
|
||||||
use crate::DatadirTimelineImpl;
|
use crate::DatadirTimelineImpl;
|
||||||
use postgres_ffi::xlog_utils::*;
|
use postgres_ffi::xlog_utils::*;
|
||||||
@@ -39,11 +40,12 @@ where
|
|||||||
timeline: &'a Arc<DatadirTimelineImpl>,
|
timeline: &'a Arc<DatadirTimelineImpl>,
|
||||||
pub lsn: Lsn,
|
pub lsn: Lsn,
|
||||||
prev_record_lsn: Lsn,
|
prev_record_lsn: Lsn,
|
||||||
|
full_backup: bool,
|
||||||
finished: bool,
|
finished: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create basebackup with non-rel data in it. Omit relational data.
|
// Create basebackup with non-rel data in it.
|
||||||
|
// Only include relational data if 'full_backup' is true.
|
||||||
//
|
//
|
||||||
// Currently we use empty lsn in two cases:
|
// Currently we use empty lsn in two cases:
|
||||||
// * During the basebackup right after timeline creation
|
// * During the basebackup right after timeline creation
|
||||||
@@ -58,6 +60,7 @@ where
|
|||||||
write: W,
|
write: W,
|
||||||
timeline: &'a Arc<DatadirTimelineImpl>,
|
timeline: &'a Arc<DatadirTimelineImpl>,
|
||||||
req_lsn: Option<Lsn>,
|
req_lsn: Option<Lsn>,
|
||||||
|
full_backup: bool,
|
||||||
) -> Result<Basebackup<'a, W>> {
|
) -> Result<Basebackup<'a, W>> {
|
||||||
// Compute postgres doesn't have any previous WAL files, but the first
|
// Compute postgres doesn't have any previous WAL files, but the first
|
||||||
// record that it's going to write needs to include the LSN of the
|
// record that it's going to write needs to include the LSN of the
|
||||||
@@ -94,8 +97,8 @@ where
|
|||||||
};
|
};
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"taking basebackup lsn={}, prev_lsn={}",
|
"taking basebackup lsn={}, prev_lsn={} (full_backup={})",
|
||||||
backup_lsn, backup_prev
|
backup_lsn, backup_prev, full_backup
|
||||||
);
|
);
|
||||||
|
|
||||||
Ok(Basebackup {
|
Ok(Basebackup {
|
||||||
@@ -103,11 +106,14 @@ where
|
|||||||
timeline,
|
timeline,
|
||||||
lsn: backup_lsn,
|
lsn: backup_lsn,
|
||||||
prev_record_lsn: backup_prev,
|
prev_record_lsn: backup_prev,
|
||||||
|
full_backup,
|
||||||
finished: false,
|
finished: false,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn send_tarball(mut self) -> anyhow::Result<()> {
|
pub fn send_tarball(mut self) -> anyhow::Result<()> {
|
||||||
|
// TODO include checksum
|
||||||
|
|
||||||
// Create pgdata subdirs structure
|
// Create pgdata subdirs structure
|
||||||
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
|
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
|
||||||
let header = new_tar_header_dir(*dir)?;
|
let header = new_tar_header_dir(*dir)?;
|
||||||
@@ -140,6 +146,13 @@ where
|
|||||||
// Create tablespace directories
|
// Create tablespace directories
|
||||||
for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? {
|
for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? {
|
||||||
self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
|
self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
|
||||||
|
|
||||||
|
// Gather and send relational files in each database if full backup is requested.
|
||||||
|
if self.full_backup {
|
||||||
|
for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? {
|
||||||
|
self.add_rel(rel)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for xid in self.timeline.list_twophase_files(self.lsn)? {
|
for xid in self.timeline.list_twophase_files(self.lsn)? {
|
||||||
self.add_twophase_file(xid)?;
|
self.add_twophase_file(xid)?;
|
||||||
@@ -157,6 +170,38 @@ where
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
|
||||||
|
let nblocks = self.timeline.get_rel_size(tag, self.lsn)?;
|
||||||
|
|
||||||
|
// Function that adds relation segment data to archive
|
||||||
|
let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> {
|
||||||
|
let file_name = tag.to_segfile_name(segment_index as u32);
|
||||||
|
let header = new_tar_header(&file_name, data.len() as u64)?;
|
||||||
|
self.ar.append(&header, data.as_slice())?;
|
||||||
|
Ok(())
|
||||||
|
};
|
||||||
|
|
||||||
|
// If the relation is empty, create an empty file
|
||||||
|
if nblocks == 0 {
|
||||||
|
add_file(0, &vec![])?;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add a file for each chunk of blocks (aka segment)
|
||||||
|
let chunks = (0..nblocks).chunks(pg_constants::RELSEG_SIZE as usize);
|
||||||
|
for (seg, blocks) in chunks.into_iter().enumerate() {
|
||||||
|
let mut segment_data: Vec<u8> = vec![];
|
||||||
|
for blknum in blocks {
|
||||||
|
let img = self.timeline.get_rel_page_at_lsn(tag, blknum, self.lsn)?;
|
||||||
|
segment_data.extend_from_slice(&img[..]);
|
||||||
|
}
|
||||||
|
|
||||||
|
add_file(seg, &segment_data)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// Generate SLRU segment files from repository.
|
// Generate SLRU segment files from repository.
|
||||||
//
|
//
|
||||||
|
|||||||
@@ -104,7 +104,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
|
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".neon"));
|
||||||
let workdir = workdir
|
let workdir = workdir
|
||||||
.canonicalize()
|
.canonicalize()
|
||||||
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
|
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
|
||||||
@@ -263,6 +263,8 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
|||||||
// start profiler (if enabled)
|
// start profiler (if enabled)
|
||||||
let profiler_guard = profiling::init_profiler(conf);
|
let profiler_guard = profiling::init_profiler(conf);
|
||||||
|
|
||||||
|
pageserver::tenant_tasks::init_tenant_task_pool()?;
|
||||||
|
|
||||||
// initialize authentication for incoming connections
|
// initialize authentication for incoming connections
|
||||||
let auth = match &conf.auth_type {
|
let auth = match &conf.auth_type {
|
||||||
AuthType::Trust | AuthType::MD5 => None,
|
AuthType::Trust | AuthType::MD5 => None,
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||||
//! a zenith Timeline.
|
//! a zenith Timeline.
|
||||||
//!
|
//!
|
||||||
use std::fs;
|
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{Read, Seek, SeekFrom};
|
use std::io::{Read, Seek, SeekFrom};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
@@ -10,16 +9,18 @@ use std::path::{Path, PathBuf};
|
|||||||
use anyhow::{bail, ensure, Context, Result};
|
use anyhow::{bail, ensure, Context, Result};
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
use crate::pgdatadir_mapping::*;
|
use crate::pgdatadir_mapping::*;
|
||||||
use crate::reltag::{RelTag, SlruKind};
|
use crate::reltag::{RelTag, SlruKind};
|
||||||
use crate::repository::Repository;
|
use crate::repository::Repository;
|
||||||
|
use crate::repository::Timeline;
|
||||||
use crate::walingest::WalIngest;
|
use crate::walingest::WalIngest;
|
||||||
use postgres_ffi::relfile_utils::*;
|
use postgres_ffi::relfile_utils::*;
|
||||||
use postgres_ffi::waldecoder::*;
|
use postgres_ffi::waldecoder::*;
|
||||||
use postgres_ffi::xlog_utils::*;
|
use postgres_ffi::xlog_utils::*;
|
||||||
|
use postgres_ffi::Oid;
|
||||||
use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED};
|
use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED};
|
||||||
use postgres_ffi::{Oid, TransactionId};
|
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -35,100 +36,29 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
|
|||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut pg_control: Option<ControlFileData> = None;
|
let mut pg_control: Option<ControlFileData> = None;
|
||||||
|
|
||||||
|
// TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn)
|
||||||
|
// Then fishing out pg_control would be unnecessary
|
||||||
let mut modification = tline.begin_modification(lsn);
|
let mut modification = tline.begin_modification(lsn);
|
||||||
modification.init_empty()?;
|
modification.init_empty()?;
|
||||||
|
|
||||||
// Scan 'global'
|
// Import all but pg_wal
|
||||||
let mut relfiles: Vec<PathBuf> = Vec::new();
|
let all_but_wal = WalkDir::new(path)
|
||||||
for direntry in fs::read_dir(path.join("global"))? {
|
.into_iter()
|
||||||
let direntry = direntry?;
|
.filter_entry(|entry| !entry.path().ends_with("pg_wal"));
|
||||||
match direntry.file_name().to_str() {
|
for entry in all_but_wal {
|
||||||
None => continue,
|
let entry = entry?;
|
||||||
|
let metadata = entry.metadata().expect("error getting dir entry metadata");
|
||||||
|
if metadata.is_file() {
|
||||||
|
let absolute_path = entry.path();
|
||||||
|
let relative_path = absolute_path.strip_prefix(path)?;
|
||||||
|
|
||||||
Some("pg_control") => {
|
let file = File::open(absolute_path)?;
|
||||||
pg_control = Some(import_control_file(&mut modification, &direntry.path())?);
|
let len = metadata.len() as usize;
|
||||||
}
|
if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? {
|
||||||
Some("pg_filenode.map") => {
|
pg_control = Some(control_file);
|
||||||
import_relmap_file(
|
|
||||||
&mut modification,
|
|
||||||
pg_constants::GLOBALTABLESPACE_OID,
|
|
||||||
0,
|
|
||||||
&direntry.path(),
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Load any relation files into the page server (but only after the other files)
|
|
||||||
_ => relfiles.push(direntry.path()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for relfile in relfiles {
|
|
||||||
import_relfile(
|
|
||||||
&mut modification,
|
|
||||||
&relfile,
|
|
||||||
pg_constants::GLOBALTABLESPACE_OID,
|
|
||||||
0,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Scan 'base'. It contains database dirs, the database OID is the filename.
|
|
||||||
// E.g. 'base/12345', where 12345 is the database OID.
|
|
||||||
for direntry in fs::read_dir(path.join("base"))? {
|
|
||||||
let direntry = direntry?;
|
|
||||||
|
|
||||||
//skip all temporary files
|
|
||||||
if direntry.file_name().to_string_lossy() == "pgsql_tmp" {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let dboid = direntry.file_name().to_string_lossy().parse::<u32>()?;
|
|
||||||
|
|
||||||
let mut relfiles: Vec<PathBuf> = Vec::new();
|
|
||||||
for direntry in fs::read_dir(direntry.path())? {
|
|
||||||
let direntry = direntry?;
|
|
||||||
match direntry.file_name().to_str() {
|
|
||||||
None => continue,
|
|
||||||
|
|
||||||
Some("PG_VERSION") => {
|
|
||||||
//modification.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?;
|
|
||||||
}
|
|
||||||
Some("pg_filenode.map") => import_relmap_file(
|
|
||||||
&mut modification,
|
|
||||||
pg_constants::DEFAULTTABLESPACE_OID,
|
|
||||||
dboid,
|
|
||||||
&direntry.path(),
|
|
||||||
)?,
|
|
||||||
|
|
||||||
// Load any relation files into the page server
|
|
||||||
_ => relfiles.push(direntry.path()),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for relfile in relfiles {
|
|
||||||
import_relfile(
|
|
||||||
&mut modification,
|
|
||||||
&relfile,
|
|
||||||
pg_constants::DEFAULTTABLESPACE_OID,
|
|
||||||
dboid,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
for entry in fs::read_dir(path.join("pg_xact"))? {
|
|
||||||
let entry = entry?;
|
|
||||||
import_slru_file(&mut modification, SlruKind::Clog, &entry.path())?;
|
|
||||||
}
|
|
||||||
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
|
|
||||||
let entry = entry?;
|
|
||||||
import_slru_file(&mut modification, SlruKind::MultiXactMembers, &entry.path())?;
|
|
||||||
}
|
|
||||||
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
|
|
||||||
let entry = entry?;
|
|
||||||
import_slru_file(&mut modification, SlruKind::MultiXactOffsets, &entry.path())?;
|
|
||||||
}
|
|
||||||
for entry in fs::read_dir(path.join("pg_twophase"))? {
|
|
||||||
let entry = entry?;
|
|
||||||
let xid = u32::from_str_radix(&entry.path().to_string_lossy(), 16)?;
|
|
||||||
import_twophase_file(&mut modification, xid, &entry.path())?;
|
|
||||||
}
|
|
||||||
// TODO: Scan pg_tblspc
|
|
||||||
|
|
||||||
// We're done importing all the data files.
|
// We're done importing all the data files.
|
||||||
modification.commit()?;
|
modification.commit()?;
|
||||||
@@ -158,31 +88,30 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
|
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
|
||||||
fn import_relfile<R: Repository>(
|
fn import_rel<R: Repository, Reader: Read>(
|
||||||
modification: &mut DatadirModification<R>,
|
modification: &mut DatadirModification<R>,
|
||||||
path: &Path,
|
path: &Path,
|
||||||
spcoid: Oid,
|
spcoid: Oid,
|
||||||
dboid: Oid,
|
dboid: Oid,
|
||||||
|
mut reader: Reader,
|
||||||
|
len: usize,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// Does it look like a relation file?
|
// Does it look like a relation file?
|
||||||
trace!("importing rel file {}", path.display());
|
trace!("importing rel file {}", path.display());
|
||||||
|
|
||||||
let (relnode, forknum, segno) = parse_relfilename(&path.file_name().unwrap().to_string_lossy())
|
let filename = &path
|
||||||
.map_err(|e| {
|
.file_name()
|
||||||
warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
|
.expect("missing rel filename")
|
||||||
e
|
.to_string_lossy();
|
||||||
})?;
|
let (relnode, forknum, segno) = parse_relfilename(filename).map_err(|e| {
|
||||||
|
warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
|
||||||
|
e
|
||||||
|
})?;
|
||||||
|
|
||||||
let mut file = File::open(path)?;
|
|
||||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||||
|
|
||||||
let len = file.metadata().unwrap().len();
|
ensure!(len % pg_constants::BLCKSZ as usize == 0);
|
||||||
ensure!(len % pg_constants::BLCKSZ as u64 == 0);
|
let nblocks = len / pg_constants::BLCKSZ as usize;
|
||||||
let nblocks = len / pg_constants::BLCKSZ as u64;
|
|
||||||
|
|
||||||
if segno != 0 {
|
|
||||||
todo!();
|
|
||||||
}
|
|
||||||
|
|
||||||
let rel = RelTag {
|
let rel = RelTag {
|
||||||
spcnode: spcoid,
|
spcnode: spcoid,
|
||||||
@@ -190,11 +119,22 @@ fn import_relfile<R: Repository>(
|
|||||||
relnode,
|
relnode,
|
||||||
forknum,
|
forknum,
|
||||||
};
|
};
|
||||||
modification.put_rel_creation(rel, nblocks as u32)?;
|
|
||||||
|
|
||||||
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
|
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
|
||||||
|
|
||||||
|
// Call put_rel_creation for every segment of the relation,
|
||||||
|
// because there is no guarantee about the order in which we are processing segments.
|
||||||
|
// ignore "relation already exists" error
|
||||||
|
if let Err(e) = modification.put_rel_creation(rel, nblocks as u32) {
|
||||||
|
if e.to_string().contains("already exists") {
|
||||||
|
debug!("relation {} already exists. we must be extending it", rel);
|
||||||
|
} else {
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let r = file.read_exact(&mut buf);
|
let r = reader.read_exact(&mut buf);
|
||||||
match r {
|
match r {
|
||||||
Ok(_) => {
|
Ok(_) => {
|
||||||
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
|
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
|
||||||
@@ -204,7 +144,9 @@ fn import_relfile<R: Repository>(
|
|||||||
Err(err) => match err.kind() {
|
Err(err) => match err.kind() {
|
||||||
std::io::ErrorKind::UnexpectedEof => {
|
std::io::ErrorKind::UnexpectedEof => {
|
||||||
// reached EOF. That's expected.
|
// reached EOF. That's expected.
|
||||||
ensure!(blknum == nblocks as u32, "unexpected EOF");
|
let relative_blknum =
|
||||||
|
blknum - segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
|
||||||
|
ensure!(relative_blknum == nblocks as u32, "unexpected EOF");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
@@ -215,96 +157,43 @@ fn import_relfile<R: Repository>(
|
|||||||
blknum += 1;
|
blknum += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Update relation size
|
||||||
|
//
|
||||||
|
// If we process rel segments out of order,
|
||||||
|
// put_rel_extend will skip the update.
|
||||||
|
modification.put_rel_extend(rel, blknum)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Import a relmapper (pg_filenode.map) file into the repository
|
|
||||||
fn import_relmap_file<R: Repository>(
|
|
||||||
modification: &mut DatadirModification<R>,
|
|
||||||
spcnode: Oid,
|
|
||||||
dbnode: Oid,
|
|
||||||
path: &Path,
|
|
||||||
) -> Result<()> {
|
|
||||||
let mut file = File::open(path)?;
|
|
||||||
let mut buffer = Vec::new();
|
|
||||||
// read the whole file
|
|
||||||
file.read_to_end(&mut buffer)?;
|
|
||||||
|
|
||||||
trace!("importing relmap file {}", path.display());
|
|
||||||
|
|
||||||
modification.put_relmap_file(spcnode, dbnode, Bytes::copy_from_slice(&buffer[..]))?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Import a twophase state file (pg_twophase/<xid>) into the repository
|
|
||||||
fn import_twophase_file<R: Repository>(
|
|
||||||
modification: &mut DatadirModification<R>,
|
|
||||||
xid: TransactionId,
|
|
||||||
path: &Path,
|
|
||||||
) -> Result<()> {
|
|
||||||
let mut file = File::open(path)?;
|
|
||||||
let mut buffer = Vec::new();
|
|
||||||
// read the whole file
|
|
||||||
file.read_to_end(&mut buffer)?;
|
|
||||||
|
|
||||||
trace!("importing non-rel file {}", path.display());
|
|
||||||
|
|
||||||
modification.put_twophase_file(xid, Bytes::copy_from_slice(&buffer[..]))?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
|
||||||
/// Import pg_control file into the repository.
|
|
||||||
///
|
|
||||||
/// The control file is imported as is, but we also extract the checkpoint record
|
|
||||||
/// from it and store it separately.
|
|
||||||
fn import_control_file<R: Repository>(
|
|
||||||
modification: &mut DatadirModification<R>,
|
|
||||||
path: &Path,
|
|
||||||
) -> Result<ControlFileData> {
|
|
||||||
let mut file = File::open(path)?;
|
|
||||||
let mut buffer = Vec::new();
|
|
||||||
// read the whole file
|
|
||||||
file.read_to_end(&mut buffer)?;
|
|
||||||
|
|
||||||
trace!("importing control file {}", path.display());
|
|
||||||
|
|
||||||
// Import it as ControlFile
|
|
||||||
modification.put_control_file(Bytes::copy_from_slice(&buffer[..]))?;
|
|
||||||
|
|
||||||
// Extract the checkpoint record and import it separately.
|
|
||||||
let pg_control = ControlFileData::decode(&buffer)?;
|
|
||||||
let checkpoint_bytes = pg_control.checkPointCopy.encode()?;
|
|
||||||
modification.put_checkpoint(checkpoint_bytes)?;
|
|
||||||
|
|
||||||
Ok(pg_control)
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
|
||||||
/// Import an SLRU segment file
|
/// Import an SLRU segment file
|
||||||
///
|
///
|
||||||
fn import_slru_file<R: Repository>(
|
fn import_slru<R: Repository, Reader: Read>(
|
||||||
modification: &mut DatadirModification<R>,
|
modification: &mut DatadirModification<R>,
|
||||||
slru: SlruKind,
|
slru: SlruKind,
|
||||||
path: &Path,
|
path: &Path,
|
||||||
|
mut reader: Reader,
|
||||||
|
len: usize,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
trace!("importing slru file {}", path.display());
|
trace!("importing slru file {}", path.display());
|
||||||
|
|
||||||
let mut file = File::open(path)?;
|
|
||||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||||
let segno = u32::from_str_radix(&path.file_name().unwrap().to_string_lossy(), 16)?;
|
let filename = &path
|
||||||
|
.file_name()
|
||||||
|
.expect("missing slru filename")
|
||||||
|
.to_string_lossy();
|
||||||
|
let segno = u32::from_str_radix(filename, 16)?;
|
||||||
|
|
||||||
let len = file.metadata().unwrap().len();
|
ensure!(len % pg_constants::BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ
|
||||||
ensure!(len % pg_constants::BLCKSZ as u64 == 0); // we assume SLRU block size is the same as BLCKSZ
|
let nblocks = len / pg_constants::BLCKSZ as usize;
|
||||||
let nblocks = len / pg_constants::BLCKSZ as u64;
|
|
||||||
|
|
||||||
ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as u64);
|
ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize);
|
||||||
|
|
||||||
modification.put_slru_segment_creation(slru, segno, nblocks as u32)?;
|
modification.put_slru_segment_creation(slru, segno, nblocks as u32)?;
|
||||||
|
|
||||||
let mut rpageno = 0;
|
let mut rpageno = 0;
|
||||||
loop {
|
loop {
|
||||||
let r = file.read_exact(&mut buf);
|
let r = reader.read_exact(&mut buf);
|
||||||
match r {
|
match r {
|
||||||
Ok(_) => {
|
Ok(_) => {
|
||||||
modification.put_slru_page_image(
|
modification.put_slru_page_image(
|
||||||
@@ -396,10 +285,258 @@ fn import_wal<R: Repository>(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if last_lsn != startpoint {
|
if last_lsn != startpoint {
|
||||||
debug!("reached end of WAL at {}", last_lsn);
|
info!("reached end of WAL at {}", last_lsn);
|
||||||
} else {
|
} else {
|
||||||
info!("no WAL to import at {}", last_lsn);
|
info!("no WAL to import at {}", last_lsn);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn import_basebackup_from_tar<R: Repository, Reader: Read>(
|
||||||
|
tline: &mut DatadirTimeline<R>,
|
||||||
|
reader: Reader,
|
||||||
|
base_lsn: Lsn,
|
||||||
|
) -> Result<()> {
|
||||||
|
info!("importing base at {}", base_lsn);
|
||||||
|
let mut modification = tline.begin_modification(base_lsn);
|
||||||
|
modification.init_empty()?;
|
||||||
|
|
||||||
|
let mut pg_control: Option<ControlFileData> = None;
|
||||||
|
|
||||||
|
// Import base
|
||||||
|
for base_tar_entry in tar::Archive::new(reader).entries()? {
|
||||||
|
let entry = base_tar_entry?;
|
||||||
|
let header = entry.header();
|
||||||
|
let len = header.entry_size()? as usize;
|
||||||
|
let file_path = header.path()?.into_owned();
|
||||||
|
|
||||||
|
match header.entry_type() {
|
||||||
|
tar::EntryType::Regular => {
|
||||||
|
if let Some(res) = import_file(&mut modification, file_path.as_ref(), entry, len)? {
|
||||||
|
// We found the pg_control file.
|
||||||
|
pg_control = Some(res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tar::EntryType::Directory => {
|
||||||
|
debug!("directory {:?}", file_path);
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
panic!("tar::EntryType::?? {}", file_path.display());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sanity check: ensure that pg_control is loaded
|
||||||
|
let _pg_control = pg_control.context("pg_control file not found")?;
|
||||||
|
|
||||||
|
modification.commit()?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn import_wal_from_tar<R: Repository, Reader: Read>(
|
||||||
|
tline: &mut DatadirTimeline<R>,
|
||||||
|
reader: Reader,
|
||||||
|
start_lsn: Lsn,
|
||||||
|
end_lsn: Lsn,
|
||||||
|
) -> Result<()> {
|
||||||
|
// Set up walingest mutable state
|
||||||
|
let mut waldecoder = WalStreamDecoder::new(start_lsn);
|
||||||
|
let mut segno = start_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||||
|
let mut offset = start_lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
|
||||||
|
let mut last_lsn = start_lsn;
|
||||||
|
let mut walingest = WalIngest::new(tline, start_lsn)?;
|
||||||
|
|
||||||
|
// Ingest wal until end_lsn
|
||||||
|
info!("importing wal until {}", end_lsn);
|
||||||
|
let mut pg_wal_tar = tar::Archive::new(reader);
|
||||||
|
let mut pg_wal_entries_iter = pg_wal_tar.entries()?;
|
||||||
|
while last_lsn <= end_lsn {
|
||||||
|
let bytes = {
|
||||||
|
let entry = pg_wal_entries_iter.next().expect("expected more wal")?;
|
||||||
|
let header = entry.header();
|
||||||
|
let file_path = header.path()?.into_owned();
|
||||||
|
|
||||||
|
match header.entry_type() {
|
||||||
|
tar::EntryType::Regular => {
|
||||||
|
// FIXME: assume postgresql tli 1 for now
|
||||||
|
let expected_filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||||
|
let file_name = file_path
|
||||||
|
.file_name()
|
||||||
|
.expect("missing wal filename")
|
||||||
|
.to_string_lossy();
|
||||||
|
ensure!(expected_filename == file_name);
|
||||||
|
|
||||||
|
debug!("processing wal file {:?}", file_path);
|
||||||
|
read_all_bytes(entry)?
|
||||||
|
}
|
||||||
|
tar::EntryType::Directory => {
|
||||||
|
debug!("directory {:?}", file_path);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
panic!("tar::EntryType::?? {}", file_path.display());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
waldecoder.feed_bytes(&bytes[offset..]);
|
||||||
|
|
||||||
|
while last_lsn <= end_lsn {
|
||||||
|
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||||
|
walingest.ingest_record(tline, recdata, lsn)?;
|
||||||
|
last_lsn = lsn;
|
||||||
|
|
||||||
|
debug!("imported record at {} (end {})", lsn, end_lsn);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!("imported records up to {}", last_lsn);
|
||||||
|
segno += 1;
|
||||||
|
offset = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if last_lsn != start_lsn {
|
||||||
|
info!("reached end of WAL at {}", last_lsn);
|
||||||
|
} else {
|
||||||
|
info!("there was no WAL to import at {}", last_lsn);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Log any extra unused files
|
||||||
|
for e in &mut pg_wal_entries_iter {
|
||||||
|
let entry = e?;
|
||||||
|
let header = entry.header();
|
||||||
|
let file_path = header.path()?.into_owned();
|
||||||
|
info!("skipping {:?}", file_path);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn import_file<R: Repository, Reader: Read>(
|
||||||
|
modification: &mut DatadirModification<R>,
|
||||||
|
file_path: &Path,
|
||||||
|
reader: Reader,
|
||||||
|
len: usize,
|
||||||
|
) -> Result<Option<ControlFileData>> {
|
||||||
|
debug!("looking at {:?}", file_path);
|
||||||
|
|
||||||
|
if file_path.starts_with("global") {
|
||||||
|
let spcnode = pg_constants::GLOBALTABLESPACE_OID;
|
||||||
|
let dbnode = 0;
|
||||||
|
|
||||||
|
match file_path
|
||||||
|
.file_name()
|
||||||
|
.expect("missing filename")
|
||||||
|
.to_string_lossy()
|
||||||
|
.as_ref()
|
||||||
|
{
|
||||||
|
"pg_control" => {
|
||||||
|
let bytes = read_all_bytes(reader)?;
|
||||||
|
|
||||||
|
// Extract the checkpoint record and import it separately.
|
||||||
|
let pg_control = ControlFileData::decode(&bytes[..])?;
|
||||||
|
let checkpoint_bytes = pg_control.checkPointCopy.encode()?;
|
||||||
|
modification.put_checkpoint(checkpoint_bytes)?;
|
||||||
|
debug!("imported control file");
|
||||||
|
|
||||||
|
// Import it as ControlFile
|
||||||
|
modification.put_control_file(bytes)?;
|
||||||
|
return Ok(Some(pg_control));
|
||||||
|
}
|
||||||
|
"pg_filenode.map" => {
|
||||||
|
let bytes = read_all_bytes(reader)?;
|
||||||
|
modification.put_relmap_file(spcnode, dbnode, bytes)?;
|
||||||
|
debug!("imported relmap file")
|
||||||
|
}
|
||||||
|
"PG_VERSION" => {
|
||||||
|
debug!("ignored");
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
|
||||||
|
debug!("imported rel creation");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if file_path.starts_with("base") {
|
||||||
|
let spcnode = pg_constants::DEFAULTTABLESPACE_OID;
|
||||||
|
let dbnode: u32 = file_path
|
||||||
|
.iter()
|
||||||
|
.nth(1)
|
||||||
|
.expect("invalid file path, expected dbnode")
|
||||||
|
.to_string_lossy()
|
||||||
|
.parse()?;
|
||||||
|
|
||||||
|
match file_path
|
||||||
|
.file_name()
|
||||||
|
.expect("missing base filename")
|
||||||
|
.to_string_lossy()
|
||||||
|
.as_ref()
|
||||||
|
{
|
||||||
|
"pg_filenode.map" => {
|
||||||
|
let bytes = read_all_bytes(reader)?;
|
||||||
|
modification.put_relmap_file(spcnode, dbnode, bytes)?;
|
||||||
|
debug!("imported relmap file")
|
||||||
|
}
|
||||||
|
"PG_VERSION" => {
|
||||||
|
debug!("ignored");
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
|
||||||
|
debug!("imported rel creation");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if file_path.starts_with("pg_xact") {
|
||||||
|
let slru = SlruKind::Clog;
|
||||||
|
|
||||||
|
import_slru(modification, slru, file_path, reader, len)?;
|
||||||
|
debug!("imported clog slru");
|
||||||
|
} else if file_path.starts_with("pg_multixact/offsets") {
|
||||||
|
let slru = SlruKind::MultiXactOffsets;
|
||||||
|
|
||||||
|
import_slru(modification, slru, file_path, reader, len)?;
|
||||||
|
debug!("imported multixact offsets slru");
|
||||||
|
} else if file_path.starts_with("pg_multixact/members") {
|
||||||
|
let slru = SlruKind::MultiXactMembers;
|
||||||
|
|
||||||
|
import_slru(modification, slru, file_path, reader, len)?;
|
||||||
|
debug!("imported multixact members slru");
|
||||||
|
} else if file_path.starts_with("pg_twophase") {
|
||||||
|
let file_name = &file_path
|
||||||
|
.file_name()
|
||||||
|
.expect("missing twophase filename")
|
||||||
|
.to_string_lossy();
|
||||||
|
let xid = u32::from_str_radix(file_name, 16)?;
|
||||||
|
|
||||||
|
let bytes = read_all_bytes(reader)?;
|
||||||
|
modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?;
|
||||||
|
debug!("imported twophase file");
|
||||||
|
} else if file_path.starts_with("pg_wal") {
|
||||||
|
debug!("found wal file in base section. ignore it");
|
||||||
|
} else if file_path.starts_with("zenith.signal") {
|
||||||
|
// Parse zenith signal file to set correct previous LSN
|
||||||
|
let bytes = read_all_bytes(reader)?;
|
||||||
|
// zenith.signal format is "PREV LSN: prev_lsn"
|
||||||
|
let zenith_signal = std::str::from_utf8(&bytes)?;
|
||||||
|
let zenith_signal = zenith_signal.split(':').collect::<Vec<_>>();
|
||||||
|
let prev_lsn = zenith_signal[1].trim().parse::<Lsn>()?;
|
||||||
|
|
||||||
|
let writer = modification.tline.tline.writer();
|
||||||
|
writer.finish_write(prev_lsn);
|
||||||
|
|
||||||
|
debug!("imported zenith signal {}", prev_lsn);
|
||||||
|
} else if file_path.starts_with("pg_tblspc") {
|
||||||
|
// TODO Backups exported from neon won't have pg_tblspc, but we will need
|
||||||
|
// this to import arbitrary postgres databases.
|
||||||
|
bail!("Importing pg_tblspc is not implemented");
|
||||||
|
} else {
|
||||||
|
debug!("ignored");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_all_bytes<Reader: Read>(mut reader: Reader) -> Result<Bytes> {
|
||||||
|
let mut buf: Vec<u8> = vec![];
|
||||||
|
reader.read_to_end(&mut buf)?;
|
||||||
|
Ok(Bytes::copy_from_slice(&buf[..]))
|
||||||
|
}
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
//! The functions here are responsible for locating the correct layer for the
|
//! The functions here are responsible for locating the correct layer for the
|
||||||
//! get/put call, tracing timeline branching history as needed.
|
//! get/put call, tracing timeline branching history as needed.
|
||||||
//!
|
//!
|
||||||
//! The files are stored in the .zenith/tenants/<tenantid>/timelines/<timelineid>
|
//! The files are stored in the .neon/tenants/<tenantid>/timelines/<timelineid>
|
||||||
//! directory. See layered_repository/README for how the files are managed.
|
//! directory. See layered_repository/README for how the files are managed.
|
||||||
//! In addition to the layer files, there is a metadata file in the same
|
//! In addition to the layer files, there is a metadata file in the same
|
||||||
//! directory that contains information about the timeline, in particular its
|
//! directory that contains information about the timeline, in particular its
|
||||||
@@ -148,7 +148,7 @@ lazy_static! {
|
|||||||
.expect("failed to define a metric");
|
.expect("failed to define a metric");
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parts of the `.zenith/tenants/<tenantid>/timelines/<timelineid>` directory prefix.
|
/// Parts of the `.neon/tenants/<tenantid>/timelines/<timelineid>` directory prefix.
|
||||||
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -158,6 +158,18 @@ pub struct LayeredRepository {
|
|||||||
// Global pageserver config parameters
|
// Global pageserver config parameters
|
||||||
pub conf: &'static PageServerConf,
|
pub conf: &'static PageServerConf,
|
||||||
|
|
||||||
|
// Allows us to gracefully cancel operations that edit the directory
|
||||||
|
// that backs this layered repository. Usage:
|
||||||
|
//
|
||||||
|
// Use `let _guard = file_lock.try_read()` while writing any files.
|
||||||
|
// Use `let _guard = file_lock.write().unwrap()` to wait for all writes to finish.
|
||||||
|
//
|
||||||
|
// TODO try_read this lock during checkpoint as well to prevent race
|
||||||
|
// between checkpoint and detach/delete.
|
||||||
|
// TODO try_read this lock for all gc/compaction operations, not just
|
||||||
|
// ones scheduled by the tenant task manager.
|
||||||
|
pub file_lock: RwLock<()>,
|
||||||
|
|
||||||
// Overridden tenant-specific config parameters.
|
// Overridden tenant-specific config parameters.
|
||||||
// We keep TenantConfOpt struct here to preserve the information
|
// We keep TenantConfOpt struct here to preserve the information
|
||||||
// about parameters that are not set.
|
// about parameters that are not set.
|
||||||
@@ -243,15 +255,15 @@ impl Repository for LayeredRepository {
|
|||||||
);
|
);
|
||||||
timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn);
|
timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn);
|
||||||
|
|
||||||
|
// Insert if not exists
|
||||||
let timeline = Arc::new(timeline);
|
let timeline = Arc::new(timeline);
|
||||||
let r = timelines.insert(
|
match timelines.entry(timelineid) {
|
||||||
timelineid,
|
Entry::Occupied(_) => bail!("Timeline already exists"),
|
||||||
LayeredTimelineEntry::Loaded(Arc::clone(&timeline)),
|
Entry::Vacant(vacant) => {
|
||||||
);
|
vacant.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline)))
|
||||||
ensure!(
|
}
|
||||||
r.is_none(),
|
};
|
||||||
"assertion failure, inserted duplicate timeline"
|
|
||||||
);
|
|
||||||
Ok(timeline)
|
Ok(timeline)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -685,6 +697,7 @@ impl LayeredRepository {
|
|||||||
) -> LayeredRepository {
|
) -> LayeredRepository {
|
||||||
LayeredRepository {
|
LayeredRepository {
|
||||||
tenant_id,
|
tenant_id,
|
||||||
|
file_lock: RwLock::new(()),
|
||||||
conf,
|
conf,
|
||||||
tenant_conf: Arc::new(RwLock::new(tenant_conf)),
|
tenant_conf: Arc::new(RwLock::new(tenant_conf)),
|
||||||
timelines: Mutex::new(HashMap::new()),
|
timelines: Mutex::new(HashMap::new()),
|
||||||
@@ -1910,15 +1923,28 @@ impl LayeredTimeline {
|
|||||||
} else {
|
} else {
|
||||||
Lsn(0)
|
Lsn(0)
|
||||||
};
|
};
|
||||||
|
// Let's consider an example:
|
||||||
|
//
|
||||||
|
// delta layer with LSN range 71-81
|
||||||
|
// delta layer with LSN range 81-91
|
||||||
|
// delta layer with LSN range 91-101
|
||||||
|
// image layer at LSN 100
|
||||||
|
//
|
||||||
|
// If 'lsn' is still 100, i.e. no new WAL has been processed since the last image layer,
|
||||||
|
// there's no need to create a new one. We check this case explicitly, to avoid passing
|
||||||
|
// a bogus range to count_deltas below, with start > end. It's even possible that there
|
||||||
|
// are some delta layers *later* than current 'lsn', if more WAL was processed and flushed
|
||||||
|
// after we read last_record_lsn, which is passed here in the 'lsn' argument.
|
||||||
|
if img_lsn < lsn {
|
||||||
|
let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?;
|
||||||
|
|
||||||
let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?;
|
debug!(
|
||||||
|
"key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
|
||||||
debug!(
|
img_range.start, img_range.end, num_deltas, img_lsn, lsn
|
||||||
"range {}-{}, has {} deltas on this timeline",
|
);
|
||||||
img_range.start, img_range.end, num_deltas
|
if num_deltas >= self.get_image_creation_threshold() {
|
||||||
);
|
return Ok(true);
|
||||||
if num_deltas >= self.get_image_creation_threshold() {
|
}
|
||||||
return Ok(true);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2210,6 +2236,9 @@ impl LayeredTimeline {
|
|||||||
LsnForTimestamp::Past(lsn) => {
|
LsnForTimestamp::Past(lsn) => {
|
||||||
debug!("past({})", lsn);
|
debug!("past({})", lsn);
|
||||||
}
|
}
|
||||||
|
LsnForTimestamp::NoData(lsn) => {
|
||||||
|
debug!("nodata({})", lsn);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn)
|
debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -123,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and
|
|||||||
a range of LSNs (or a single LSN, in case of image layers). You can think of it
|
a range of LSNs (or a single LSN, in case of image layers). You can think of it
|
||||||
as a rectangle in the two-dimensional key-LSN space. The layer files for each
|
as a rectangle in the two-dimensional key-LSN space. The layer files for each
|
||||||
timeline are stored in the timeline's subdirectory under
|
timeline are stored in the timeline's subdirectory under
|
||||||
`.zenith/tenants/<tenantid>/timelines`.
|
`.neon/tenants/<tenantid>/timelines`.
|
||||||
|
|
||||||
There are two kinds of layer files: images and delta layers. An image file
|
There are two kinds of layer files: images and delta layers. An image file
|
||||||
contains a snapshot of all keys at a particular LSN, whereas a delta file
|
contains a snapshot of all keys at a particular LSN, whereas a delta file
|
||||||
@@ -178,7 +178,7 @@ version, and how branching and GC works is still valid.
|
|||||||
The full path of a delta file looks like this:
|
The full path of a delta file looks like this:
|
||||||
|
|
||||||
```
|
```
|
||||||
.zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
|
.neon/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
|
||||||
```
|
```
|
||||||
|
|
||||||
For simplicity, the examples below use a simplified notation for the
|
For simplicity, the examples below use a simplified notation for the
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ pub trait BlobCursor {
|
|||||||
) -> Result<(), std::io::Error>;
|
) -> Result<(), std::io::Error>;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, R> BlobCursor for BlockCursor<R>
|
impl<R> BlobCursor for BlockCursor<R>
|
||||||
where
|
where
|
||||||
R: BlockReader,
|
R: BlockReader,
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -445,7 +445,10 @@ impl ImageLayerWriter {
|
|||||||
},
|
},
|
||||||
);
|
);
|
||||||
info!("new image layer {}", path.display());
|
info!("new image layer {}", path.display());
|
||||||
let mut file = VirtualFile::create(&path)?;
|
let mut file = VirtualFile::open_with_options(
|
||||||
|
&path,
|
||||||
|
std::fs::OpenOptions::new().write(true).create_new(true),
|
||||||
|
)?;
|
||||||
// make room for the header block
|
// make room for the header block
|
||||||
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
|
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
|
||||||
let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64);
|
let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64);
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ pub mod repository;
|
|||||||
pub mod storage_sync;
|
pub mod storage_sync;
|
||||||
pub mod tenant_config;
|
pub mod tenant_config;
|
||||||
pub mod tenant_mgr;
|
pub mod tenant_mgr;
|
||||||
pub mod tenant_threads;
|
pub mod tenant_tasks;
|
||||||
pub mod thread_mgr;
|
pub mod thread_mgr;
|
||||||
pub mod timelines;
|
pub mod timelines;
|
||||||
pub mod virtual_file;
|
pub mod virtual_file;
|
||||||
@@ -24,7 +24,6 @@ pub mod walredo;
|
|||||||
|
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
use utils::postgres_backend;
|
|
||||||
|
|
||||||
use crate::thread_mgr::ThreadKind;
|
use crate::thread_mgr::ThreadKind;
|
||||||
use metrics::{register_int_gauge_vec, IntGaugeVec};
|
use metrics::{register_int_gauge_vec, IntGaugeVec};
|
||||||
@@ -73,7 +72,6 @@ pub fn shutdown_pageserver(exit_code: i32) {
|
|||||||
thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None);
|
thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None);
|
||||||
|
|
||||||
// Shut down any page service threads.
|
// Shut down any page service threads.
|
||||||
postgres_backend::set_pgbackend_shutdown_requested();
|
|
||||||
thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None);
|
thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None);
|
||||||
|
|
||||||
// Shut down all the tenants. This flushes everything to disk and kills
|
// Shut down all the tenants. This flushes everything to disk and kills
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ use anyhow::{bail, ensure, Context, Result};
|
|||||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use std::io;
|
use std::io::{self, Read};
|
||||||
use std::net::TcpListener;
|
use std::net::TcpListener;
|
||||||
use std::str;
|
use std::str;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
@@ -29,6 +29,8 @@ use utils::{
|
|||||||
|
|
||||||
use crate::basebackup;
|
use crate::basebackup;
|
||||||
use crate::config::{PageServerConf, ProfilingConfig};
|
use crate::config::{PageServerConf, ProfilingConfig};
|
||||||
|
use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar};
|
||||||
|
use crate::layered_repository::LayeredRepository;
|
||||||
use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp};
|
use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp};
|
||||||
use crate::profiling::profpoint_start;
|
use crate::profiling::profpoint_start;
|
||||||
use crate::reltag::RelTag;
|
use crate::reltag::RelTag;
|
||||||
@@ -200,6 +202,96 @@ impl PagestreamBeMessage {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Implements Read for the server side of CopyIn
|
||||||
|
struct CopyInReader<'a> {
|
||||||
|
pgb: &'a mut PostgresBackend,
|
||||||
|
|
||||||
|
/// Overflow buffer for bytes sent in CopyData messages
|
||||||
|
/// that the reader (caller of read) hasn't asked for yet.
|
||||||
|
/// TODO use BytesMut?
|
||||||
|
buf: Vec<u8>,
|
||||||
|
|
||||||
|
/// Bytes before `buf_begin` are considered as dropped.
|
||||||
|
/// This allows us to implement O(1) pop_front on Vec<u8>.
|
||||||
|
/// The Vec won't grow large because we only add to it
|
||||||
|
/// when it's empty.
|
||||||
|
buf_begin: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> CopyInReader<'a> {
|
||||||
|
// NOTE: pgb should already be in the CopyIn state
|
||||||
|
fn new(pgb: &'a mut PostgresBackend) -> Self {
|
||||||
|
Self {
|
||||||
|
pgb,
|
||||||
|
buf: Vec::<_>::new(),
|
||||||
|
buf_begin: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Drop for CopyInReader<'a> {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
// Finalize copy protocol so that self.pgb can be reused
|
||||||
|
// TODO instead, maybe take ownership of pgb and give it back at the end
|
||||||
|
let mut buf: Vec<u8> = vec![];
|
||||||
|
let _ = self.read_to_end(&mut buf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Read for CopyInReader<'a> {
|
||||||
|
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||||
|
while !thread_mgr::is_shutdown_requested() {
|
||||||
|
// Return from buffer if nonempty
|
||||||
|
if self.buf_begin < self.buf.len() {
|
||||||
|
let bytes_to_read = std::cmp::min(buf.len(), self.buf.len() - self.buf_begin);
|
||||||
|
buf[..bytes_to_read].copy_from_slice(&self.buf[self.buf_begin..][..bytes_to_read]);
|
||||||
|
self.buf_begin += bytes_to_read;
|
||||||
|
return Ok(bytes_to_read);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete garbage
|
||||||
|
self.buf.clear();
|
||||||
|
self.buf_begin = 0;
|
||||||
|
|
||||||
|
// Wait for client to send CopyData bytes
|
||||||
|
match self.pgb.read_message() {
|
||||||
|
Ok(Some(message)) => {
|
||||||
|
let copy_data_bytes = match message {
|
||||||
|
FeMessage::CopyData(bytes) => bytes,
|
||||||
|
FeMessage::CopyDone => return Ok(0),
|
||||||
|
FeMessage::Sync => continue,
|
||||||
|
m => {
|
||||||
|
let msg = format!("unexpected message {:?}", m);
|
||||||
|
self.pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
|
||||||
|
return Err(io::Error::new(io::ErrorKind::Other, msg));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Return as much as we can, saving the rest in self.buf
|
||||||
|
let mut reader = copy_data_bytes.reader();
|
||||||
|
let bytes_read = reader.read(buf)?;
|
||||||
|
reader.read_to_end(&mut self.buf)?;
|
||||||
|
return Ok(bytes_read);
|
||||||
|
}
|
||||||
|
Ok(None) => {
|
||||||
|
let msg = "client closed connection";
|
||||||
|
self.pgb.write_message(&BeMessage::ErrorResponse(msg))?;
|
||||||
|
return Err(io::Error::new(io::ErrorKind::Other, msg));
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
if !is_socket_read_timed_out(&e) {
|
||||||
|
return Err(io::Error::new(io::ErrorKind::Other, e));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Shutting down
|
||||||
|
let msg = "Importer thread was shut down";
|
||||||
|
Err(io::Error::new(io::ErrorKind::Other, msg))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -370,6 +462,10 @@ impl PageServerHandler {
|
|||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();
|
let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();
|
||||||
|
|
||||||
|
// NOTE: pagerequests handler exits when connection is closed,
|
||||||
|
// so there is no need to reset the association
|
||||||
|
thread_mgr::associate_with(Some(tenantid), Some(timelineid));
|
||||||
|
|
||||||
// Check that the timeline exists
|
// Check that the timeline exists
|
||||||
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
||||||
.context("Cannot load local timeline")?;
|
.context("Cannot load local timeline")?;
|
||||||
@@ -443,6 +539,98 @@ impl PageServerHandler {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn handle_import_basebackup(
|
||||||
|
&self,
|
||||||
|
pgb: &mut PostgresBackend,
|
||||||
|
tenant_id: ZTenantId,
|
||||||
|
timeline_id: ZTimelineId,
|
||||||
|
base_lsn: Lsn,
|
||||||
|
_end_lsn: Lsn,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
thread_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||||
|
let _enter =
|
||||||
|
info_span!("import basebackup", timeline = %timeline_id, tenant = %tenant_id).entered();
|
||||||
|
|
||||||
|
// Create empty timeline
|
||||||
|
info!("creating new timeline");
|
||||||
|
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||||
|
let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?;
|
||||||
|
let repartition_distance = repo.get_checkpoint_distance();
|
||||||
|
let mut datadir_timeline =
|
||||||
|
DatadirTimeline::<LayeredRepository>::new(timeline, repartition_distance);
|
||||||
|
|
||||||
|
// TODO mark timeline as not ready until it reaches end_lsn.
|
||||||
|
// We might have some wal to import as well, and we should prevent compute
|
||||||
|
// from connecting before that and writing conflicting wal.
|
||||||
|
//
|
||||||
|
// This is not relevant for pageserver->pageserver migrations, since there's
|
||||||
|
// no wal to import. But should be fixed if we want to import from postgres.
|
||||||
|
|
||||||
|
// TODO leave clean state on error. For now you can use detach to clean
|
||||||
|
// up broken state from a failed import.
|
||||||
|
|
||||||
|
// Import basebackup provided via CopyData
|
||||||
|
info!("importing basebackup");
|
||||||
|
pgb.write_message(&BeMessage::CopyInResponse)?;
|
||||||
|
let reader = CopyInReader::new(pgb);
|
||||||
|
import_basebackup_from_tar(&mut datadir_timeline, reader, base_lsn)?;
|
||||||
|
|
||||||
|
// TODO check checksum
|
||||||
|
// Meanwhile you can verify client-side by taking fullbackup
|
||||||
|
// and checking that it matches in size with what was imported.
|
||||||
|
// It wouldn't work if base came from vanilla postgres though,
|
||||||
|
// since we discard some log files.
|
||||||
|
|
||||||
|
// Flush data to disk, then upload to s3
|
||||||
|
info!("flushing layers");
|
||||||
|
datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?;
|
||||||
|
|
||||||
|
info!("done");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_import_wal(
|
||||||
|
&self,
|
||||||
|
pgb: &mut PostgresBackend,
|
||||||
|
tenant_id: ZTenantId,
|
||||||
|
timeline_id: ZTimelineId,
|
||||||
|
start_lsn: Lsn,
|
||||||
|
end_lsn: Lsn,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
thread_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||||
|
let _enter =
|
||||||
|
info_span!("import wal", timeline = %timeline_id, tenant = %tenant_id).entered();
|
||||||
|
|
||||||
|
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||||
|
let timeline = repo.get_timeline_load(timeline_id)?;
|
||||||
|
ensure!(timeline.get_last_record_lsn() == start_lsn);
|
||||||
|
|
||||||
|
let repartition_distance = repo.get_checkpoint_distance();
|
||||||
|
let mut datadir_timeline =
|
||||||
|
DatadirTimeline::<LayeredRepository>::new(timeline, repartition_distance);
|
||||||
|
|
||||||
|
// TODO leave clean state on error. For now you can use detach to clean
|
||||||
|
// up broken state from a failed import.
|
||||||
|
|
||||||
|
// Import wal provided via CopyData
|
||||||
|
info!("importing wal");
|
||||||
|
pgb.write_message(&BeMessage::CopyInResponse)?;
|
||||||
|
let reader = CopyInReader::new(pgb);
|
||||||
|
import_wal_from_tar(&mut datadir_timeline, reader, start_lsn, end_lsn)?;
|
||||||
|
|
||||||
|
// TODO Does it make sense to overshoot?
|
||||||
|
ensure!(datadir_timeline.tline.get_last_record_lsn() >= end_lsn);
|
||||||
|
|
||||||
|
// Flush data to disk, then upload to s3. No need for a forced checkpoint.
|
||||||
|
// We only want to persist the data, and it doesn't matter if it's in the
|
||||||
|
// shape of deltas or images.
|
||||||
|
info!("flushing layers");
|
||||||
|
datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?;
|
||||||
|
|
||||||
|
info!("done");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// Helper function to handle the LSN from client request.
|
/// Helper function to handle the LSN from client request.
|
||||||
///
|
///
|
||||||
/// Each GetPage (and Exists and Nblocks) request includes information about
|
/// Each GetPage (and Exists and Nblocks) request includes information about
|
||||||
@@ -545,17 +733,10 @@ impl PageServerHandler {
|
|||||||
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
|
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
|
||||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
|
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
|
||||||
|
|
||||||
let all_rels = timeline.list_rels(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?;
|
let total_blocks =
|
||||||
let mut total_blocks: i64 = 0;
|
timeline.get_db_size(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?;
|
||||||
|
|
||||||
for rel in all_rels {
|
let db_size = total_blocks as i64 * pg_constants::BLCKSZ as i64;
|
||||||
if rel.forknum == 0 {
|
|
||||||
let n_blocks = timeline.get_rel_size(rel, lsn).unwrap_or(0);
|
|
||||||
total_blocks += n_blocks as i64;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let db_size = total_blocks * pg_constants::BLCKSZ as i64;
|
|
||||||
|
|
||||||
Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
|
Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
|
||||||
db_size,
|
db_size,
|
||||||
@@ -592,6 +773,7 @@ impl PageServerHandler {
|
|||||||
timelineid: ZTimelineId,
|
timelineid: ZTimelineId,
|
||||||
lsn: Option<Lsn>,
|
lsn: Option<Lsn>,
|
||||||
tenantid: ZTenantId,
|
tenantid: ZTenantId,
|
||||||
|
full_backup: bool,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty);
|
let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty);
|
||||||
let _enter = span.enter();
|
let _enter = span.enter();
|
||||||
@@ -614,7 +796,7 @@ impl PageServerHandler {
|
|||||||
{
|
{
|
||||||
let mut writer = CopyDataSink { pgb };
|
let mut writer = CopyDataSink { pgb };
|
||||||
|
|
||||||
let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
|
let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn, full_backup)?;
|
||||||
span.record("lsn", &basebackup.lsn.to_string().as_str());
|
span.record("lsn", &basebackup.lsn.to_string().as_str());
|
||||||
basebackup.send_tarball()?;
|
basebackup.send_tarball()?;
|
||||||
}
|
}
|
||||||
@@ -672,6 +854,10 @@ impl postgres_backend::Handler for PageServerHandler {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reports whether shutdown has been requested, by delegating to
/// `thread_mgr::is_shutdown_requested()`.
fn is_shutdown_requested(&self) -> bool {
|
||||||
|
thread_mgr::is_shutdown_requested()
|
||||||
|
}
|
||||||
|
|
||||||
fn process_query(
|
fn process_query(
|
||||||
&mut self,
|
&mut self,
|
||||||
pgb: &mut PostgresBackend,
|
pgb: &mut PostgresBackend,
|
||||||
@@ -713,8 +899,79 @@ impl postgres_backend::Handler for PageServerHandler {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Check that the timeline exists
|
// Check that the timeline exists
|
||||||
self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?;
|
self.handle_basebackup_request(pgb, timelineid, lsn, tenantid, false)?;
|
||||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||||
|
}
|
||||||
|
// same as basebackup, but result includes relational data as well
|
||||||
|
else if query_string.starts_with("fullbackup ") {
|
||||||
|
let (_, params_raw) = query_string.split_at("fullbackup ".len());
|
||||||
|
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||||
|
|
||||||
|
ensure!(
|
||||||
|
params.len() == 3,
|
||||||
|
"invalid param number for fullbackup command"
|
||||||
|
);
|
||||||
|
|
||||||
|
let tenantid = ZTenantId::from_str(params[0])?;
|
||||||
|
let timelineid = ZTimelineId::from_str(params[1])?;
|
||||||
|
|
||||||
|
self.check_permission(Some(tenantid))?;
|
||||||
|
|
||||||
|
// Lsn is required for fullbackup, because otherwise we would not know
|
||||||
|
// at which lsn to upload this backup.
|
||||||
|
//
|
||||||
|
// The caller is responsible for providing a valid lsn
|
||||||
|
// and using it in the subsequent import.
|
||||||
|
let lsn = Some(Lsn::from_str(params[2])?);
|
||||||
|
|
||||||
|
// Check that the timeline exists
|
||||||
|
self.handle_basebackup_request(pgb, timelineid, lsn, tenantid, true)?;
|
||||||
|
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||||
|
} else if query_string.starts_with("import basebackup ") {
|
||||||
|
// Import the `base` section (everything but the wal) of a basebackup.
|
||||||
|
// Assumes the tenant already exists on this pageserver.
|
||||||
|
//
|
||||||
|
// Files are scheduled to be persisted to remote storage, and the
|
||||||
|
// caller should poll the http api to check when that is done.
|
||||||
|
//
|
||||||
|
// Example import command:
|
||||||
|
// 1. Get start/end LSN from backup_manifest file
|
||||||
|
// 2. Run:
|
||||||
|
// cat my_backup/base.tar | psql -h $PAGESERVER \
|
||||||
|
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN"
|
||||||
|
let (_, params_raw) = query_string.split_at("import basebackup ".len());
|
||||||
|
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||||
|
ensure!(params.len() == 4);
|
||||||
|
let tenant = ZTenantId::from_str(params[0])?;
|
||||||
|
let timeline = ZTimelineId::from_str(params[1])?;
|
||||||
|
let base_lsn = Lsn::from_str(params[2])?;
|
||||||
|
let end_lsn = Lsn::from_str(params[3])?;
|
||||||
|
|
||||||
|
self.check_permission(Some(tenant))?;
|
||||||
|
|
||||||
|
match self.handle_import_basebackup(pgb, tenant, timeline, base_lsn, end_lsn) {
|
||||||
|
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||||
|
Err(e) => pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?,
|
||||||
|
};
|
||||||
|
} else if query_string.starts_with("import wal ") {
|
||||||
|
// Import the `pg_wal` section of a basebackup.
|
||||||
|
//
|
||||||
|
// Files are scheduled to be persisted to remote storage, and the
|
||||||
|
// caller should poll the http api to check when that is done.
|
||||||
|
let (_, params_raw) = query_string.split_at("import wal ".len());
|
||||||
|
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||||
|
ensure!(params.len() == 4);
|
||||||
|
let tenant = ZTenantId::from_str(params[0])?;
|
||||||
|
let timeline = ZTimelineId::from_str(params[1])?;
|
||||||
|
let start_lsn = Lsn::from_str(params[2])?;
|
||||||
|
let end_lsn = Lsn::from_str(params[3])?;
|
||||||
|
|
||||||
|
self.check_permission(Some(tenant))?;
|
||||||
|
|
||||||
|
match self.handle_import_wal(pgb, tenant, timeline, start_lsn, end_lsn) {
|
||||||
|
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||||
|
Err(e) => pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?,
|
||||||
|
};
|
||||||
} else if query_string.to_ascii_lowercase().starts_with("set ") {
|
} else if query_string.to_ascii_lowercase().starts_with("set ") {
|
||||||
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
||||||
// on connect
|
// on connect
|
||||||
@@ -802,7 +1059,6 @@ impl postgres_backend::Handler for PageServerHandler {
|
|||||||
.map(|h| h.as_str().parse())
|
.map(|h| h.as_str().parse())
|
||||||
.unwrap_or_else(|| Ok(repo.get_gc_horizon()))?;
|
.unwrap_or_else(|| Ok(repo.get_gc_horizon()))?;
|
||||||
|
|
||||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
|
||||||
// Use tenant's pitr setting
|
// Use tenant's pitr setting
|
||||||
let pitr = repo.get_pitr_interval();
|
let pitr = repo.get_pitr_interval();
|
||||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?;
|
let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?;
|
||||||
@@ -895,6 +1151,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
|||||||
LsnForTimestamp::Present(lsn) => format!("{}", lsn),
|
LsnForTimestamp::Present(lsn) => format!("{}", lsn),
|
||||||
LsnForTimestamp::Future(_lsn) => "future".into(),
|
LsnForTimestamp::Future(_lsn) => "future".into(),
|
||||||
LsnForTimestamp::Past(_lsn) => "past".into(),
|
LsnForTimestamp::Past(_lsn) => "past".into(),
|
||||||
|
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
|
||||||
};
|
};
|
||||||
pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?;
|
pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?;
|
||||||
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||||
|
|||||||
@@ -51,6 +51,7 @@ pub enum LsnForTimestamp {
|
|||||||
Present(Lsn),
|
Present(Lsn),
|
||||||
Future(Lsn),
|
Future(Lsn),
|
||||||
Past(Lsn),
|
Past(Lsn),
|
||||||
|
NoData(Lsn),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<R: Repository> DatadirTimeline<R> {
|
impl<R: Repository> DatadirTimeline<R> {
|
||||||
@@ -123,6 +124,19 @@ impl<R: Repository> DatadirTimeline<R> {
|
|||||||
self.tline.get(key, lsn)
|
self.tline.get(key, lsn)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Get size of a database in blocks
|
||||||
|
pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<usize> {
|
||||||
|
let mut total_blocks = 0;
|
||||||
|
|
||||||
|
let rels = self.list_rels(spcnode, dbnode, lsn)?;
|
||||||
|
|
||||||
|
for rel in rels {
|
||||||
|
let n_blocks = self.get_rel_size(rel, lsn)?;
|
||||||
|
total_blocks += n_blocks as usize;
|
||||||
|
}
|
||||||
|
Ok(total_blocks)
|
||||||
|
}
|
||||||
|
|
||||||
/// Get size of a relation file
|
/// Get size of a relation file
|
||||||
pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<BlockNumber> {
|
pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<BlockNumber> {
|
||||||
ensure!(tag.relnode != 0, "invalid relnode");
|
ensure!(tag.relnode != 0, "invalid relnode");
|
||||||
@@ -250,7 +264,7 @@ impl<R: Repository> DatadirTimeline<R> {
|
|||||||
(false, false) => {
|
(false, false) => {
|
||||||
// This can happen if no commit records have been processed yet, e.g.
|
// This can happen if no commit records have been processed yet, e.g.
|
||||||
// just after importing a cluster.
|
// just after importing a cluster.
|
||||||
bail!("no commit timestamps found");
|
Ok(LsnForTimestamp::NoData(max_lsn))
|
||||||
}
|
}
|
||||||
(true, false) => {
|
(true, false) => {
|
||||||
// Didn't find any commit timestamps larger than the request
|
// Didn't find any commit timestamps larger than the request
|
||||||
@@ -667,6 +681,10 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> {
|
pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> {
|
||||||
|
let req_lsn = self.tline.get_last_record_lsn();
|
||||||
|
|
||||||
|
let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn)?;
|
||||||
|
|
||||||
// Remove entry from dbdir
|
// Remove entry from dbdir
|
||||||
let buf = self.get(DBDIR_KEY)?;
|
let buf = self.get(DBDIR_KEY)?;
|
||||||
let mut dir = DbDirectory::des(&buf)?;
|
let mut dir = DbDirectory::des(&buf)?;
|
||||||
@@ -680,7 +698,8 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: update pending_nblocks
|
// Update logical database size.
|
||||||
|
self.pending_nblocks -= total_blocks as isize;
|
||||||
|
|
||||||
// Delete all relations and metadata files for the spcnode/dnode
|
// Delete all relations and metadata files for the spcnode/dnode
|
||||||
self.delete(dbdir_key_range(spcnode, dbnode));
|
self.delete(dbdir_key_range(spcnode, dbnode));
|
||||||
@@ -749,6 +768,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Extend relation
|
/// Extend relation
|
||||||
|
/// If new size is smaller, do nothing.
|
||||||
pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
|
pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
|
||||||
ensure!(rel.relnode != 0, "invalid relnode");
|
ensure!(rel.relnode != 0, "invalid relnode");
|
||||||
|
|
||||||
@@ -756,10 +776,13 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
|
|||||||
let size_key = rel_size_to_key(rel);
|
let size_key = rel_size_to_key(rel);
|
||||||
let old_size = self.get(size_key)?.get_u32_le();
|
let old_size = self.get(size_key)?.get_u32_le();
|
||||||
|
|
||||||
let buf = nblocks.to_le_bytes();
|
// only extend relation here. never decrease the size
|
||||||
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
|
if nblocks > old_size {
|
||||||
|
let buf = nblocks.to_le_bytes();
|
||||||
|
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
|
||||||
|
|
||||||
self.pending_nblocks += nblocks as isize - old_size as isize;
|
self.pending_nblocks += nblocks as isize - old_size as isize;
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -81,6 +81,12 @@ mod profiling_impl {
|
|||||||
|
|
||||||
pub struct DummyProfilerGuard;
|
pub struct DummyProfilerGuard;
|
||||||
|
|
||||||
|
// Explicit, intentionally empty Drop implementation for the dummy guard.
impl Drop for DummyProfilerGuard {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
// do nothing, this exists to calm Clippy down
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn profpoint_start(
|
pub fn profpoint_start(
|
||||||
_conf: &PageServerConf,
|
_conf: &PageServerConf,
|
||||||
_point: ProfilingConfig,
|
_point: ProfilingConfig,
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ use std::cmp::Ordering;
|
|||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
|
||||||
use postgres_ffi::relfile_utils::forknumber_to_name;
|
use postgres_ffi::relfile_utils::forknumber_to_name;
|
||||||
use postgres_ffi::Oid;
|
use postgres_ffi::{pg_constants, Oid};
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Relation data file segment id throughout the Postgres cluster.
|
/// Relation data file segment id throughout the Postgres cluster.
|
||||||
@@ -75,6 +75,30 @@ impl fmt::Display for RelTag {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl RelTag {
|
||||||
|
pub fn to_segfile_name(&self, segno: u32) -> String {
|
||||||
|
let mut name = if self.spcnode == pg_constants::GLOBALTABLESPACE_OID {
|
||||||
|
"global/".to_string()
|
||||||
|
} else {
|
||||||
|
format!("base/{}/", self.dbnode)
|
||||||
|
};
|
||||||
|
|
||||||
|
name += &self.relnode.to_string();
|
||||||
|
|
||||||
|
if let Some(fork_name) = forknumber_to_name(self.forknum) {
|
||||||
|
name += "_";
|
||||||
|
name += fork_name;
|
||||||
|
}
|
||||||
|
|
||||||
|
if segno != 0 {
|
||||||
|
name += ".";
|
||||||
|
name += &segno.to_string();
|
||||||
|
}
|
||||||
|
|
||||||
|
name
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and
|
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and
|
||||||
/// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer,
|
/// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer,
|
||||||
|
|||||||
@@ -197,7 +197,7 @@ impl Display for TimelineSyncStatusUpdate {
|
|||||||
}
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
/// A repository corresponds to one .zenith directory. One repository holds multiple
|
/// A repository corresponds to one .neon directory. One repository holds multiple
|
||||||
/// timelines, forked off from the same initial call to 'initdb'.
|
/// timelines, forked off from the same initial call to 'initdb'.
|
||||||
pub trait Repository: Send + Sync {
|
pub trait Repository: Send + Sync {
|
||||||
type Timeline: Timeline;
|
type Timeline: Timeline;
|
||||||
|
|||||||
@@ -186,8 +186,8 @@ use crate::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
use metrics::{
|
use metrics::{
|
||||||
register_histogram_vec, register_int_counter, register_int_gauge, HistogramVec, IntCounter,
|
register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge,
|
||||||
IntGauge,
|
HistogramVec, IntCounter, IntCounterVec, IntGauge,
|
||||||
};
|
};
|
||||||
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
|
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||||
|
|
||||||
@@ -208,14 +208,17 @@ lazy_static! {
|
|||||||
static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!(
|
static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!(
|
||||||
"pageserver_remote_storage_image_sync_seconds",
|
"pageserver_remote_storage_image_sync_seconds",
|
||||||
"Time took to synchronize (download or upload) a whole pageserver image. \
|
"Time took to synchronize (download or upload) a whole pageserver image. \
|
||||||
Grouped by `operation_kind` (upload|download) and `status` (success|failure)",
|
Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)",
|
||||||
&["operation_kind", "status"],
|
&["tenant_id", "timeline_id", "operation_kind", "status"],
|
||||||
vec![
|
vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0]
|
||||||
0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 5.0, 6.0, 7.0,
|
|
||||||
8.0, 9.0, 10.0, 12.5, 15.0, 17.5, 20.0
|
|
||||||
]
|
|
||||||
)
|
)
|
||||||
.expect("failed to register pageserver image sync time histogram vec");
|
.expect("failed to register pageserver image sync time histogram vec");
|
||||||
|
static ref REMOTE_INDEX_UPLOAD: IntCounterVec = register_int_counter_vec!(
|
||||||
|
"pageserver_remote_storage_remote_index_uploads_total",
|
||||||
|
"Number of remote index uploads",
|
||||||
|
&["tenant_id", "timeline_id"],
|
||||||
|
)
|
||||||
|
.expect("failed to register pageserver remote index upload vec");
|
||||||
}
|
}
|
||||||
|
|
||||||
static SYNC_QUEUE: OnceCell<SyncQueue> = OnceCell::new();
|
static SYNC_QUEUE: OnceCell<SyncQueue> = OnceCell::new();
|
||||||
@@ -1146,19 +1149,19 @@ where
|
|||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
DownloadedTimeline::Abort => {
|
DownloadedTimeline::Abort => {
|
||||||
register_sync_status(sync_start, task_name, None);
|
register_sync_status(sync_id, sync_start, task_name, None);
|
||||||
if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) {
|
if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) {
|
||||||
error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}");
|
error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
DownloadedTimeline::FailedAndRescheduled => {
|
DownloadedTimeline::FailedAndRescheduled => {
|
||||||
register_sync_status(sync_start, task_name, Some(false));
|
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||||
}
|
}
|
||||||
DownloadedTimeline::Successful(mut download_data) => {
|
DownloadedTimeline::Successful(mut download_data) => {
|
||||||
match update_local_metadata(conf, sync_id, current_remote_timeline).await {
|
match update_local_metadata(conf, sync_id, current_remote_timeline).await {
|
||||||
Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
|
Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
|
||||||
Ok(()) => {
|
Ok(()) => {
|
||||||
register_sync_status(sync_start, task_name, Some(true));
|
register_sync_status(sync_id, sync_start, task_name, Some(true));
|
||||||
return Some(TimelineSyncStatusUpdate::Downloaded);
|
return Some(TimelineSyncStatusUpdate::Downloaded);
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
@@ -1169,7 +1172,7 @@ where
|
|||||||
error!("Failed to update local timeline metadata: {e:?}");
|
error!("Failed to update local timeline metadata: {e:?}");
|
||||||
download_data.retries += 1;
|
download_data.retries += 1;
|
||||||
sync_queue.push(sync_id, SyncTask::Download(download_data));
|
sync_queue.push(sync_id, SyncTask::Download(download_data));
|
||||||
register_sync_status(sync_start, task_name, Some(false));
|
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1265,14 +1268,14 @@ async fn delete_timeline_data<P, S>(
|
|||||||
error!("Failed to update remote timeline {sync_id}: {e:?}");
|
error!("Failed to update remote timeline {sync_id}: {e:?}");
|
||||||
new_delete_data.retries += 1;
|
new_delete_data.retries += 1;
|
||||||
sync_queue.push(sync_id, SyncTask::Delete(new_delete_data));
|
sync_queue.push(sync_id, SyncTask::Delete(new_delete_data));
|
||||||
register_sync_status(sync_start, task_name, Some(false));
|
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
timeline_delete.deletion_registered = true;
|
timeline_delete.deletion_registered = true;
|
||||||
|
|
||||||
let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await;
|
let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await;
|
||||||
register_sync_status(sync_start, task_name, Some(sync_status));
|
register_sync_status(sync_id, sync_start, task_name, Some(sync_status));
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result<TimelineMetadata> {
|
async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result<TimelineMetadata> {
|
||||||
@@ -1306,7 +1309,7 @@ async fn upload_timeline_data<P, S>(
|
|||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
UploadedTimeline::FailedAndRescheduled => {
|
UploadedTimeline::FailedAndRescheduled => {
|
||||||
register_sync_status(sync_start, task_name, Some(false));
|
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
UploadedTimeline::Successful(upload_data) => upload_data,
|
UploadedTimeline::Successful(upload_data) => upload_data,
|
||||||
@@ -1325,13 +1328,13 @@ async fn upload_timeline_data<P, S>(
|
|||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(()) => {
|
Ok(()) => {
|
||||||
register_sync_status(sync_start, task_name, Some(true));
|
register_sync_status(sync_id, sync_start, task_name, Some(true));
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!("Failed to update remote timeline {sync_id}: {e:?}");
|
error!("Failed to update remote timeline {sync_id}: {e:?}");
|
||||||
uploaded_data.retries += 1;
|
uploaded_data.retries += 1;
|
||||||
sync_queue.push(sync_id, SyncTask::Upload(uploaded_data));
|
sync_queue.push(sync_id, SyncTask::Upload(uploaded_data));
|
||||||
register_sync_status(sync_start, task_name, Some(false));
|
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1421,7 +1424,14 @@ where
|
|||||||
IndexPart::from_remote_timeline(&timeline_path, updated_remote_timeline)
|
IndexPart::from_remote_timeline(&timeline_path, updated_remote_timeline)
|
||||||
.context("Failed to create an index part from the updated remote timeline")?;
|
.context("Failed to create an index part from the updated remote timeline")?;
|
||||||
|
|
||||||
info!("Uploading remote index for the timeline");
|
debug!("Uploading remote index for the timeline");
|
||||||
|
REMOTE_INDEX_UPLOAD
|
||||||
|
.with_label_values(&[
|
||||||
|
&sync_id.tenant_id.to_string(),
|
||||||
|
&sync_id.timeline_id.to_string(),
|
||||||
|
])
|
||||||
|
.inc();
|
||||||
|
|
||||||
upload_index_part(conf, storage, sync_id, new_index_part)
|
upload_index_part(conf, storage, sync_id, new_index_part)
|
||||||
.await
|
.await
|
||||||
.context("Failed to upload new index part")
|
.context("Failed to upload new index part")
|
||||||
@@ -1590,12 +1600,24 @@ fn compare_local_and_remote_timeline(
|
|||||||
(initial_timeline_status, awaits_download)
|
(initial_timeline_status, awaits_download)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Option<bool>) {
|
fn register_sync_status(
|
||||||
|
sync_id: ZTenantTimelineId,
|
||||||
|
sync_start: Instant,
|
||||||
|
sync_name: &str,
|
||||||
|
sync_status: Option<bool>,
|
||||||
|
) {
|
||||||
let secs_elapsed = sync_start.elapsed().as_secs_f64();
|
let secs_elapsed = sync_start.elapsed().as_secs_f64();
|
||||||
info!("Processed a sync task in {secs_elapsed:.2} seconds");
|
debug!("Processed a sync task in {secs_elapsed:.2} seconds");
|
||||||
|
|
||||||
|
let tenant_id = sync_id.tenant_id.to_string();
|
||||||
|
let timeline_id = sync_id.timeline_id.to_string();
|
||||||
match sync_status {
|
match sync_status {
|
||||||
Some(true) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "success"]),
|
Some(true) => {
|
||||||
Some(false) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "failure"]),
|
IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "success"])
|
||||||
|
}
|
||||||
|
Some(false) => {
|
||||||
|
IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "failure"])
|
||||||
|
}
|
||||||
None => return,
|
None => return,
|
||||||
}
|
}
|
||||||
.observe(secs_elapsed)
|
.observe(secs_elapsed)
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ use std::{fmt::Debug, path::PathBuf};
|
|||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use futures::stream::{FuturesUnordered, StreamExt};
|
use futures::stream::{FuturesUnordered, StreamExt};
|
||||||
|
use lazy_static::lazy_static;
|
||||||
use remote_storage::RemoteStorage;
|
use remote_storage::RemoteStorage;
|
||||||
use tokio::fs;
|
use tokio::fs;
|
||||||
use tracing::{debug, error, info, warn};
|
use tracing::{debug, error, info, warn};
|
||||||
@@ -17,6 +18,16 @@ use super::{
|
|||||||
use crate::{
|
use crate::{
|
||||||
config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask,
|
config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask,
|
||||||
};
|
};
|
||||||
|
use metrics::{register_int_counter_vec, IntCounterVec};
|
||||||
|
|
||||||
|
lazy_static! {
|
||||||
|
static ref NO_LAYERS_UPLOAD: IntCounterVec = register_int_counter_vec!(
|
||||||
|
"pageserver_remote_storage_no_layers_uploads_total",
|
||||||
|
"Number of skipped uploads due to no layers",
|
||||||
|
&["tenant_id", "timeline_id"],
|
||||||
|
)
|
||||||
|
.expect("failed to register pageserver no layers upload vec");
|
||||||
|
}
|
||||||
|
|
||||||
/// Serializes and uploads the given index part data to the remote storage.
|
/// Serializes and uploads the given index part data to the remote storage.
|
||||||
pub(super) async fn upload_index_part<P, S>(
|
pub(super) async fn upload_index_part<P, S>(
|
||||||
@@ -102,7 +113,13 @@ where
|
|||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
if layers_to_upload.is_empty() {
|
if layers_to_upload.is_empty() {
|
||||||
info!("No layers to upload after filtering, aborting");
|
debug!("No layers to upload after filtering, aborting");
|
||||||
|
NO_LAYERS_UPLOAD
|
||||||
|
.with_label_values(&[
|
||||||
|
&sync_id.tenant_id.to_string(),
|
||||||
|
&sync_id.timeline_id.to_string(),
|
||||||
|
])
|
||||||
|
.inc();
|
||||||
return UploadedTimeline::Successful(upload_data);
|
return UploadedTimeline::Successful(upload_data);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ pub mod defaults {
|
|||||||
pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
|
pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
|
||||||
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
|
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
|
||||||
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
|
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
|
||||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10_000;
|
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Per-tenant configuration options
|
/// Per-tenant configuration options
|
||||||
|
|||||||
@@ -230,8 +230,6 @@ pub fn shutdown_all_tenants() {
|
|||||||
drop(m);
|
drop(m);
|
||||||
|
|
||||||
thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None);
|
thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None);
|
||||||
thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None);
|
|
||||||
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), None, None);
|
|
||||||
|
|
||||||
// Ok, no background threads running anymore. Flush any remaining data in
|
// Ok, no background threads running anymore. Flush any remaining data in
|
||||||
// memory to disk.
|
// memory to disk.
|
||||||
@@ -330,44 +328,12 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow:
|
|||||||
}
|
}
|
||||||
(TenantState::Idle, TenantState::Active) => {
|
(TenantState::Idle, TenantState::Active) => {
|
||||||
info!("activating tenant {tenant_id}");
|
info!("activating tenant {tenant_id}");
|
||||||
let compactor_spawn_result = thread_mgr::spawn(
|
|
||||||
ThreadKind::Compactor,
|
|
||||||
Some(tenant_id),
|
|
||||||
None,
|
|
||||||
"Compactor thread",
|
|
||||||
false,
|
|
||||||
move || crate::tenant_threads::compact_loop(tenant_id),
|
|
||||||
);
|
|
||||||
if compactor_spawn_result.is_err() {
|
|
||||||
let mut m = tenants_state::write_tenants();
|
|
||||||
m.get_mut(&tenant_id)
|
|
||||||
.with_context(|| format!("Tenant not found for id {tenant_id}"))?
|
|
||||||
.state = old_state;
|
|
||||||
drop(m);
|
|
||||||
}
|
|
||||||
compactor_spawn_result?;
|
|
||||||
|
|
||||||
let gc_spawn_result = thread_mgr::spawn(
|
// Spawn gc and compaction loops. The loops will shut themselves
|
||||||
ThreadKind::GarbageCollector,
|
// down when they notice that the tenant is inactive.
|
||||||
Some(tenant_id),
|
// TODO maybe use tokio::sync::watch instead?
|
||||||
None,
|
crate::tenant_tasks::start_compaction_loop(tenant_id)?;
|
||||||
"GC thread",
|
crate::tenant_tasks::start_gc_loop(tenant_id)?;
|
||||||
false,
|
|
||||||
move || crate::tenant_threads::gc_loop(tenant_id),
|
|
||||||
)
|
|
||||||
.map(|_thread_id| ()) // update the `Result::Ok` type to match the outer function's return signature
|
|
||||||
.with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}"));
|
|
||||||
|
|
||||||
if let Err(e) = &gc_spawn_result {
|
|
||||||
let mut m = tenants_state::write_tenants();
|
|
||||||
m.get_mut(&tenant_id)
|
|
||||||
.with_context(|| format!("Tenant not found for id {tenant_id}"))?
|
|
||||||
.state = old_state;
|
|
||||||
drop(m);
|
|
||||||
error!("Failed to start GC thread for tenant {tenant_id}, stopping its checkpointer thread: {e:?}");
|
|
||||||
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
|
|
||||||
return gc_spawn_result;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
(TenantState::Idle, TenantState::Stopping) => {
|
(TenantState::Idle, TenantState::Stopping) => {
|
||||||
info!("stopping idle tenant {tenant_id}");
|
info!("stopping idle tenant {tenant_id}");
|
||||||
@@ -379,8 +345,10 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow:
|
|||||||
Some(tenant_id),
|
Some(tenant_id),
|
||||||
None,
|
None,
|
||||||
);
|
);
|
||||||
thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), Some(tenant_id), None);
|
|
||||||
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
|
// Wait until all gc/compaction tasks finish
|
||||||
|
let repo = get_repository_for_tenant(tenant_id)?;
|
||||||
|
let _guard = repo.file_lock.write().unwrap();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
288
pageserver/src/tenant_tasks.rs
Normal file
288
pageserver/src/tenant_tasks.rs
Normal file
@@ -0,0 +1,288 @@
|
|||||||
|
//! This module contains functions to serve per-tenant background processes,
|
||||||
|
//! such as compaction and GC
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::ops::ControlFlow;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use crate::repository::Repository;
|
||||||
|
use crate::tenant_mgr::TenantState;
|
||||||
|
use crate::thread_mgr::ThreadKind;
|
||||||
|
use crate::{tenant_mgr, thread_mgr};
|
||||||
|
use anyhow::{self, Context};
|
||||||
|
use futures::stream::FuturesUnordered;
|
||||||
|
use futures::StreamExt;
|
||||||
|
use metrics::{register_int_counter_vec, IntCounterVec};
|
||||||
|
use once_cell::sync::{Lazy, OnceCell};
|
||||||
|
use tokio::sync::mpsc;
|
||||||
|
use tokio::sync::watch;
|
||||||
|
use tracing::*;
|
||||||
|
use utils::zid::ZTenantId;
|
||||||
|
|
||||||
|
static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||||
|
register_int_counter_vec!(
|
||||||
|
"pageserver_tenant_task_events",
|
||||||
|
"Number of task start/stop/fail events.",
|
||||||
|
&["event"],
|
||||||
|
)
|
||||||
|
.expect("Failed to register tenant_task_events metric")
|
||||||
|
});
|
||||||
|
|
||||||
|
///
|
||||||
|
/// Compaction task's main loop
|
||||||
|
///
|
||||||
|
async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) {
|
||||||
|
loop {
|
||||||
|
trace!("waking up");
|
||||||
|
|
||||||
|
// Run blocking part of the task
|
||||||
|
let period: Result<Result<_, anyhow::Error>, _> = tokio::task::spawn_blocking(move || {
|
||||||
|
// Break if tenant is not active
|
||||||
|
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
||||||
|
return Ok(ControlFlow::Break(()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Break if we're not allowed to write to disk
|
||||||
|
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||||
|
// TODO do this inside repo.compaction_iteration instead.
|
||||||
|
let _guard = match repo.file_lock.try_read() {
|
||||||
|
Ok(g) => g,
|
||||||
|
Err(_) => return Ok(ControlFlow::Break(())),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Run compaction
|
||||||
|
let compaction_period = repo.get_compaction_period();
|
||||||
|
repo.compaction_iteration()?;
|
||||||
|
Ok(ControlFlow::Continue(compaction_period))
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
|
||||||
|
// Decide whether to sleep or break
|
||||||
|
let sleep_duration = match period {
|
||||||
|
Ok(Ok(ControlFlow::Continue(period))) => period,
|
||||||
|
Ok(Ok(ControlFlow::Break(()))) => break,
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
error!("Compaction failed, retrying: {}", e);
|
||||||
|
Duration::from_secs(2)
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("Compaction join error, retrying: {}", e);
|
||||||
|
Duration::from_secs(2)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Sleep
|
||||||
|
tokio::select! {
|
||||||
|
_ = cancel.changed() => {
|
||||||
|
trace!("received cancellation request");
|
||||||
|
break;
|
||||||
|
},
|
||||||
|
_ = tokio::time::sleep(sleep_duration) => {},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
trace!(
|
||||||
|
"compaction loop stopped. State is {:?}",
|
||||||
|
tenant_mgr::get_tenant_state(tenantid)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
static START_GC_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();
|
||||||
|
static START_COMPACTION_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();
|
||||||
|
|
||||||
|
/// Spawn a task that will periodically schedule garbage collection until
|
||||||
|
/// the tenant becomes inactive. This should be called on tenant
|
||||||
|
/// activation.
|
||||||
|
pub fn start_gc_loop(tenantid: ZTenantId) -> anyhow::Result<()> {
|
||||||
|
START_GC_LOOP
|
||||||
|
.get()
|
||||||
|
.context("Failed to get START_GC_LOOP")?
|
||||||
|
.blocking_send(tenantid)
|
||||||
|
.context("Failed to send to START_GC_LOOP channel")?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Spawn a task that will periodically schedule compaction until
|
||||||
|
/// the tenant becomes inactive. This should be called on tenant
|
||||||
|
/// activation.
|
||||||
|
pub fn start_compaction_loop(tenantid: ZTenantId) -> anyhow::Result<()> {
|
||||||
|
START_COMPACTION_LOOP
|
||||||
|
.get()
|
||||||
|
.context("failed to get START_COMPACTION_LOOP")?
|
||||||
|
.blocking_send(tenantid)
|
||||||
|
.context("failed to send to START_COMPACTION_LOOP")?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Spawn the TenantTaskManager
|
||||||
|
/// This needs to be called before start_gc_loop or start_compaction_loop
|
||||||
|
pub fn init_tenant_task_pool() -> anyhow::Result<()> {
|
||||||
|
let runtime = tokio::runtime::Builder::new_multi_thread()
|
||||||
|
.thread_name("tenant-task-worker")
|
||||||
|
.worker_threads(40) // Way more than necessary
|
||||||
|
.max_blocking_threads(100) // Way more than necessary
|
||||||
|
.enable_all()
|
||||||
|
.build()?;
|
||||||
|
|
||||||
|
let (gc_send, mut gc_recv) = mpsc::channel::<ZTenantId>(100);
|
||||||
|
START_GC_LOOP
|
||||||
|
.set(gc_send)
|
||||||
|
.expect("Failed to set START_GC_LOOP");
|
||||||
|
|
||||||
|
let (compaction_send, mut compaction_recv) = mpsc::channel::<ZTenantId>(100);
|
||||||
|
START_COMPACTION_LOOP
|
||||||
|
.set(compaction_send)
|
||||||
|
.expect("Failed to set START_COMPACTION_LOOP");
|
||||||
|
|
||||||
|
// TODO this is getting repetitive
|
||||||
|
let mut gc_loops = HashMap::<ZTenantId, watch::Sender<()>>::new();
|
||||||
|
let mut compaction_loops = HashMap::<ZTenantId, watch::Sender<()>>::new();
|
||||||
|
|
||||||
|
thread_mgr::spawn(
|
||||||
|
ThreadKind::TenantTaskManager,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
"Tenant task manager main thread",
|
||||||
|
true,
|
||||||
|
move || {
|
||||||
|
runtime.block_on(async move {
|
||||||
|
let mut futures = FuturesUnordered::new();
|
||||||
|
loop {
|
||||||
|
tokio::select! {
|
||||||
|
_ = thread_mgr::shutdown_watcher() => {
|
||||||
|
// Send cancellation to all tasks
|
||||||
|
for (_, cancel) in gc_loops.drain() {
|
||||||
|
cancel.send(()).ok();
|
||||||
|
}
|
||||||
|
for (_, cancel) in compaction_loops.drain() {
|
||||||
|
cancel.send(()).ok();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Exit after all tasks finish
|
||||||
|
while let Some(result) = futures.next().await {
|
||||||
|
match result {
|
||||||
|
Ok(()) => {
|
||||||
|
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||||
|
},
|
||||||
|
Err(e) => {
|
||||||
|
TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc();
|
||||||
|
error!("loop join error {}", e)
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
},
|
||||||
|
tenantid = gc_recv.recv() => {
|
||||||
|
let tenantid = tenantid.expect("Gc task channel closed unexpectedly");
|
||||||
|
|
||||||
|
// Spawn new task, request cancellation of the old one if exists
|
||||||
|
let (cancel_send, cancel_recv) = watch::channel(());
|
||||||
|
let handle = tokio::spawn(gc_loop(tenantid, cancel_recv)
|
||||||
|
.instrument(info_span!("gc loop", tenant = %tenantid)));
|
||||||
|
if let Some(old_cancel_send) = gc_loops.insert(tenantid, cancel_send) {
|
||||||
|
old_cancel_send.send(()).ok();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update metrics, remember handle
|
||||||
|
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||||
|
futures.push(handle);
|
||||||
|
},
|
||||||
|
tenantid = compaction_recv.recv() => {
|
||||||
|
let tenantid = tenantid.expect("Compaction task channel closed unexpectedly");
|
||||||
|
|
||||||
|
// Spawn new task, request cancellation of the old one if exists
|
||||||
|
let (cancel_send, cancel_recv) = watch::channel(());
|
||||||
|
let handle = tokio::spawn(compaction_loop(tenantid, cancel_recv)
|
||||||
|
.instrument(info_span!("compaction loop", tenant = %tenantid)));
|
||||||
|
if let Some(old_cancel_send) = compaction_loops.insert(tenantid, cancel_send) {
|
||||||
|
old_cancel_send.send(()).ok();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update metrics, remember handle
|
||||||
|
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||||
|
futures.push(handle);
|
||||||
|
},
|
||||||
|
result = futures.next() => {
|
||||||
|
// Log and count any unhandled panics
|
||||||
|
match result {
|
||||||
|
Some(Ok(())) => {
|
||||||
|
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||||
|
},
|
||||||
|
Some(Err(e)) => {
|
||||||
|
TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc();
|
||||||
|
error!("loop join error {}", e)
|
||||||
|
},
|
||||||
|
None => {},
|
||||||
|
};
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
Ok(())
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
/// GC task's main loop
|
||||||
|
///
|
||||||
|
async fn gc_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) {
|
||||||
|
loop {
|
||||||
|
trace!("waking up");
|
||||||
|
|
||||||
|
// Run blocking part of the task
|
||||||
|
let period: Result<Result<_, anyhow::Error>, _> = tokio::task::spawn_blocking(move || {
|
||||||
|
// Break if tenant is not active
|
||||||
|
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
||||||
|
return Ok(ControlFlow::Break(()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Break if we're not allowed to write to disk
|
||||||
|
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||||
|
// TODO do this inside repo.gc_iteration instead.
|
||||||
|
let _guard = match repo.file_lock.try_read() {
|
||||||
|
Ok(g) => g,
|
||||||
|
Err(_) => return Ok(ControlFlow::Break(())),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Run gc
|
||||||
|
let gc_period = repo.get_gc_period();
|
||||||
|
let gc_horizon = repo.get_gc_horizon();
|
||||||
|
if gc_horizon > 0 {
|
||||||
|
repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(ControlFlow::Continue(gc_period))
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
|
||||||
|
// Decide whether to sleep or break
|
||||||
|
let sleep_duration = match period {
|
||||||
|
Ok(Ok(ControlFlow::Continue(period))) => period,
|
||||||
|
Ok(Ok(ControlFlow::Break(()))) => break,
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
error!("Gc failed, retrying: {}", e);
|
||||||
|
Duration::from_secs(2)
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("Gc join error, retrying: {}", e);
|
||||||
|
Duration::from_secs(2)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Sleep
|
||||||
|
tokio::select! {
|
||||||
|
_ = cancel.changed() => {
|
||||||
|
trace!("received cancellation request");
|
||||||
|
break;
|
||||||
|
},
|
||||||
|
_ = tokio::time::sleep(sleep_duration) => {},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
trace!(
|
||||||
|
"GC loop stopped. State is {:?}",
|
||||||
|
tenant_mgr::get_tenant_state(tenantid)
|
||||||
|
);
|
||||||
|
}
|
||||||
@@ -1,79 +0,0 @@
|
|||||||
//! This module contains functions to serve per-tenant background processes,
|
|
||||||
//! such as compaction and GC
|
|
||||||
use crate::repository::Repository;
|
|
||||||
use crate::tenant_mgr;
|
|
||||||
use crate::tenant_mgr::TenantState;
|
|
||||||
use anyhow::Result;
|
|
||||||
use std::time::Duration;
|
|
||||||
use tracing::*;
|
|
||||||
use utils::zid::ZTenantId;
|
|
||||||
|
|
||||||
///
|
|
||||||
/// Compaction thread's main loop
|
|
||||||
///
|
|
||||||
pub fn compact_loop(tenantid: ZTenantId) -> Result<()> {
|
|
||||||
if let Err(err) = compact_loop_ext(tenantid) {
|
|
||||||
error!("compact loop terminated with error: {:?}", err);
|
|
||||||
Err(err)
|
|
||||||
} else {
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn compact_loop_ext(tenantid: ZTenantId) -> Result<()> {
|
|
||||||
loop {
|
|
||||||
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
|
||||||
let compaction_period = repo.get_compaction_period();
|
|
||||||
|
|
||||||
std::thread::sleep(compaction_period);
|
|
||||||
trace!("compaction thread for tenant {} waking up", tenantid);
|
|
||||||
|
|
||||||
// Compact timelines
|
|
||||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
|
||||||
repo.compaction_iteration()?;
|
|
||||||
}
|
|
||||||
|
|
||||||
trace!(
|
|
||||||
"compaction thread stopped for tenant {} state is {:?}",
|
|
||||||
tenantid,
|
|
||||||
tenant_mgr::get_tenant_state(tenantid)
|
|
||||||
);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
|
||||||
/// GC thread's main loop
|
|
||||||
///
|
|
||||||
pub fn gc_loop(tenantid: ZTenantId) -> Result<()> {
|
|
||||||
loop {
|
|
||||||
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
trace!("gc thread for tenant {} waking up", tenantid);
|
|
||||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
|
||||||
let gc_horizon = repo.get_gc_horizon();
|
|
||||||
// Garbage collect old files that are not needed for PITR anymore
|
|
||||||
if gc_horizon > 0 {
|
|
||||||
repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO Write it in more adequate way using
|
|
||||||
// condvar.wait_timeout() or something
|
|
||||||
let mut sleep_time = repo.get_gc_period().as_secs();
|
|
||||||
while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == Some(TenantState::Active)
|
|
||||||
{
|
|
||||||
sleep_time -= 1;
|
|
||||||
std::thread::sleep(Duration::from_secs(1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
trace!(
|
|
||||||
"GC thread stopped for tenant {} state is {:?}",
|
|
||||||
tenantid,
|
|
||||||
tenant_mgr::get_tenant_state(tenantid)
|
|
||||||
);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
@@ -94,11 +94,8 @@ pub enum ThreadKind {
|
|||||||
// Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL.
|
// Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL.
|
||||||
WalReceiverManager,
|
WalReceiverManager,
|
||||||
|
|
||||||
// Thread that handles compaction of all timelines for a tenant.
|
// Thread that schedules new compaction and gc jobs
|
||||||
Compactor,
|
TenantTaskManager,
|
||||||
|
|
||||||
// Thread that handles GC of a tenant
|
|
||||||
GarbageCollector,
|
|
||||||
|
|
||||||
// Thread that flushes frozen in-memory layers to disk
|
// Thread that flushes frozen in-memory layers to disk
|
||||||
LayerFlushThread,
|
LayerFlushThread,
|
||||||
@@ -108,15 +105,21 @@ pub enum ThreadKind {
|
|||||||
StorageSync,
|
StorageSync,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct MutableThreadState {
|
||||||
|
/// Tenant and timeline that this thread is associated with.
|
||||||
|
tenant_id: Option<ZTenantId>,
|
||||||
|
timeline_id: Option<ZTimelineId>,
|
||||||
|
|
||||||
|
/// Handle for waiting for the thread to exit. It can be None, if the
|
||||||
|
/// thread has already exited.
|
||||||
|
join_handle: Option<JoinHandle<()>>,
|
||||||
|
}
|
||||||
|
|
||||||
struct PageServerThread {
|
struct PageServerThread {
|
||||||
_thread_id: u64,
|
_thread_id: u64,
|
||||||
|
|
||||||
kind: ThreadKind,
|
kind: ThreadKind,
|
||||||
|
|
||||||
/// Tenant and timeline that this thread is associated with.
|
|
||||||
tenant_id: Option<ZTenantId>,
|
|
||||||
timeline_id: Option<ZTimelineId>,
|
|
||||||
|
|
||||||
name: String,
|
name: String,
|
||||||
|
|
||||||
// To request thread shutdown, set the flag, and send a dummy message to the
|
// To request thread shutdown, set the flag, and send a dummy message to the
|
||||||
@@ -124,9 +127,7 @@ struct PageServerThread {
|
|||||||
shutdown_requested: AtomicBool,
|
shutdown_requested: AtomicBool,
|
||||||
shutdown_tx: watch::Sender<()>,
|
shutdown_tx: watch::Sender<()>,
|
||||||
|
|
||||||
/// Handle for waiting for the thread to exit. It can be None, if the
|
mutable: Mutex<MutableThreadState>,
|
||||||
/// thread has already exited.
|
|
||||||
join_handle: Mutex<Option<JoinHandle<()>>>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Launch a new thread
|
/// Launch a new thread
|
||||||
@@ -145,29 +146,27 @@ where
|
|||||||
{
|
{
|
||||||
let (shutdown_tx, shutdown_rx) = watch::channel(());
|
let (shutdown_tx, shutdown_rx) = watch::channel(());
|
||||||
let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed);
|
let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed);
|
||||||
let thread = PageServerThread {
|
let thread = Arc::new(PageServerThread {
|
||||||
_thread_id: thread_id,
|
_thread_id: thread_id,
|
||||||
kind,
|
kind,
|
||||||
tenant_id,
|
|
||||||
timeline_id,
|
|
||||||
name: name.to_string(),
|
name: name.to_string(),
|
||||||
|
|
||||||
shutdown_requested: AtomicBool::new(false),
|
shutdown_requested: AtomicBool::new(false),
|
||||||
shutdown_tx,
|
shutdown_tx,
|
||||||
|
mutable: Mutex::new(MutableThreadState {
|
||||||
join_handle: Mutex::new(None),
|
tenant_id,
|
||||||
};
|
timeline_id,
|
||||||
|
join_handle: None,
|
||||||
let thread_rc = Arc::new(thread);
|
}),
|
||||||
|
});
|
||||||
let mut jh_guard = thread_rc.join_handle.lock().unwrap();
|
|
||||||
|
|
||||||
THREADS
|
THREADS
|
||||||
.lock()
|
.lock()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.insert(thread_id, Arc::clone(&thread_rc));
|
.insert(thread_id, Arc::clone(&thread));
|
||||||
|
|
||||||
let thread_rc2 = Arc::clone(&thread_rc);
|
let mut thread_mut = thread.mutable.lock().unwrap();
|
||||||
|
|
||||||
|
let thread_cloned = Arc::clone(&thread);
|
||||||
let thread_name = name.to_string();
|
let thread_name = name.to_string();
|
||||||
let join_handle = match thread::Builder::new()
|
let join_handle = match thread::Builder::new()
|
||||||
.name(name.to_string())
|
.name(name.to_string())
|
||||||
@@ -175,7 +174,7 @@ where
|
|||||||
thread_wrapper(
|
thread_wrapper(
|
||||||
thread_name,
|
thread_name,
|
||||||
thread_id,
|
thread_id,
|
||||||
thread_rc2,
|
thread_cloned,
|
||||||
shutdown_rx,
|
shutdown_rx,
|
||||||
shutdown_process_on_error,
|
shutdown_process_on_error,
|
||||||
f,
|
f,
|
||||||
@@ -189,8 +188,8 @@ where
|
|||||||
return Err(err);
|
return Err(err);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
*jh_guard = Some(join_handle);
|
thread_mut.join_handle = Some(join_handle);
|
||||||
drop(jh_guard);
|
drop(thread_mut);
|
||||||
|
|
||||||
// The thread is now running. Nothing more to do here
|
// The thread is now running. Nothing more to do here
|
||||||
Ok(thread_id)
|
Ok(thread_id)
|
||||||
@@ -229,19 +228,20 @@ fn thread_wrapper<F>(
|
|||||||
.remove(&thread_id)
|
.remove(&thread_id)
|
||||||
.expect("no thread in registry");
|
.expect("no thread in registry");
|
||||||
|
|
||||||
|
let thread_mut = thread.mutable.lock().unwrap();
|
||||||
match result {
|
match result {
|
||||||
Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name),
|
Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name),
|
||||||
Ok(Err(err)) => {
|
Ok(Err(err)) => {
|
||||||
if shutdown_process_on_error {
|
if shutdown_process_on_error {
|
||||||
error!(
|
error!(
|
||||||
"Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
"Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||||
thread_name, thread.tenant_id, thread.timeline_id, err
|
thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err
|
||||||
);
|
);
|
||||||
shutdown_pageserver(1);
|
shutdown_pageserver(1);
|
||||||
} else {
|
} else {
|
||||||
error!(
|
error!(
|
||||||
"Thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
"Thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||||
thread_name, thread.tenant_id, thread.timeline_id, err
|
thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -249,19 +249,29 @@ fn thread_wrapper<F>(
|
|||||||
if shutdown_process_on_error {
|
if shutdown_process_on_error {
|
||||||
error!(
|
error!(
|
||||||
"Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
"Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||||
thread_name, thread.tenant_id, thread.timeline_id, err
|
thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err
|
||||||
);
|
);
|
||||||
shutdown_pageserver(1);
|
shutdown_pageserver(1);
|
||||||
} else {
|
} else {
|
||||||
error!(
|
error!(
|
||||||
"Thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
"Thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||||
thread_name, thread.tenant_id, thread.timeline_id, err
|
thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// expected to be called from the thread of the given id.
|
||||||
|
pub fn associate_with(tenant_id: Option<ZTenantId>, timeline_id: Option<ZTimelineId>) {
|
||||||
|
CURRENT_THREAD.with(|ct| {
|
||||||
|
let borrowed = ct.borrow();
|
||||||
|
let mut thread_mut = borrowed.as_ref().unwrap().mutable.lock().unwrap();
|
||||||
|
thread_mut.tenant_id = tenant_id;
|
||||||
|
thread_mut.timeline_id = timeline_id;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
/// Is there a thread running that matches the criteria
|
/// Is there a thread running that matches the criteria
|
||||||
|
|
||||||
/// Signal and wait for threads to shut down.
|
/// Signal and wait for threads to shut down.
|
||||||
@@ -285,9 +295,10 @@ pub fn shutdown_threads(
|
|||||||
|
|
||||||
let threads = THREADS.lock().unwrap();
|
let threads = THREADS.lock().unwrap();
|
||||||
for thread in threads.values() {
|
for thread in threads.values() {
|
||||||
|
let thread_mut = thread.mutable.lock().unwrap();
|
||||||
if (kind.is_none() || Some(thread.kind) == kind)
|
if (kind.is_none() || Some(thread.kind) == kind)
|
||||||
&& (tenant_id.is_none() || thread.tenant_id == tenant_id)
|
&& (tenant_id.is_none() || thread_mut.tenant_id == tenant_id)
|
||||||
&& (timeline_id.is_none() || thread.timeline_id == timeline_id)
|
&& (timeline_id.is_none() || thread_mut.timeline_id == timeline_id)
|
||||||
{
|
{
|
||||||
thread.shutdown_requested.store(true, Ordering::Relaxed);
|
thread.shutdown_requested.store(true, Ordering::Relaxed);
|
||||||
// FIXME: handle error?
|
// FIXME: handle error?
|
||||||
@@ -298,8 +309,10 @@ pub fn shutdown_threads(
|
|||||||
drop(threads);
|
drop(threads);
|
||||||
|
|
||||||
for thread in victim_threads {
|
for thread in victim_threads {
|
||||||
|
let mut thread_mut = thread.mutable.lock().unwrap();
|
||||||
info!("waiting for {} to shut down", thread.name);
|
info!("waiting for {} to shut down", thread.name);
|
||||||
if let Some(join_handle) = thread.join_handle.lock().unwrap().take() {
|
if let Some(join_handle) = thread_mut.join_handle.take() {
|
||||||
|
drop(thread_mut);
|
||||||
let _ = join_handle.join();
|
let _ = join_handle.join();
|
||||||
} else {
|
} else {
|
||||||
// The thread had not even fully started yet. Or it was shut down
|
// The thread had not even fully started yet. Or it was shut down
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
1221
pageserver/src/walreceiver/connection_manager.rs
Normal file
1221
pageserver/src/walreceiver/connection_manager.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,5 @@
|
|||||||
//! Actual Postgres connection handler to stream WAL to the server.
|
//! Actual Postgres connection handler to stream WAL to the server.
|
||||||
//! Runs as a separate, cancellable Tokio task.
|
|
||||||
use std::{
|
use std::{
|
||||||
str::FromStr,
|
str::FromStr,
|
||||||
sync::Arc,
|
sync::Arc,
|
||||||
@@ -10,113 +10,29 @@ use anyhow::{bail, ensure, Context};
|
|||||||
use bytes::BytesMut;
|
use bytes::BytesMut;
|
||||||
use fail::fail_point;
|
use fail::fail_point;
|
||||||
use postgres::{SimpleQueryMessage, SimpleQueryRow};
|
use postgres::{SimpleQueryMessage, SimpleQueryRow};
|
||||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
|
||||||
use postgres_protocol::message::backend::ReplicationMessage;
|
use postgres_protocol::message::backend::ReplicationMessage;
|
||||||
use postgres_types::PgLsn;
|
use postgres_types::PgLsn;
|
||||||
use tokio::{pin, select, sync::watch, time};
|
use tokio::{pin, select, sync::watch, time};
|
||||||
use tokio_postgres::{replication::ReplicationStream, Client};
|
use tokio_postgres::{replication::ReplicationStream, Client};
|
||||||
use tokio_stream::StreamExt;
|
use tokio_stream::StreamExt;
|
||||||
use tracing::{debug, error, info, info_span, trace, warn, Instrument};
|
use tracing::{debug, error, info, info_span, trace, warn, Instrument};
|
||||||
use utils::{
|
|
||||||
lsn::Lsn,
|
|
||||||
pq_proto::ZenithFeedback,
|
|
||||||
zid::{NodeId, ZTenantTimelineId},
|
|
||||||
};
|
|
||||||
|
|
||||||
|
use super::TaskEvent;
|
||||||
use crate::{
|
use crate::{
|
||||||
http::models::WalReceiverEntry,
|
http::models::WalReceiverEntry,
|
||||||
repository::{Repository, Timeline},
|
repository::{Repository, Timeline},
|
||||||
tenant_mgr,
|
tenant_mgr,
|
||||||
walingest::WalIngest,
|
walingest::WalIngest,
|
||||||
};
|
};
|
||||||
|
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||||
|
use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId};
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
/// Opens a connection to the given wal producer and streams the WAL, sending progress messages during streaming.
|
||||||
pub enum WalConnectionEvent {
|
pub async fn handle_walreceiver_connection(
|
||||||
Started,
|
|
||||||
NewWal(ZenithFeedback),
|
|
||||||
End(Result<(), String>),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A wrapper around standalone Tokio task, to poll its updates or cancel the task.
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub struct WalReceiverConnection {
|
|
||||||
handle: tokio::task::JoinHandle<()>,
|
|
||||||
cancellation: watch::Sender<()>,
|
|
||||||
events_receiver: watch::Receiver<WalConnectionEvent>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl WalReceiverConnection {
|
|
||||||
/// Initializes the connection task, returning a set of handles on top of it.
|
|
||||||
/// The task is started immediately after the creation, fails if no connection is established during the timeout given.
|
|
||||||
pub fn open(
|
|
||||||
id: ZTenantTimelineId,
|
|
||||||
safekeeper_id: NodeId,
|
|
||||||
wal_producer_connstr: String,
|
|
||||||
connect_timeout: Duration,
|
|
||||||
) -> Self {
|
|
||||||
let (cancellation, mut cancellation_receiver) = watch::channel(());
|
|
||||||
let (events_sender, events_receiver) = watch::channel(WalConnectionEvent::Started);
|
|
||||||
|
|
||||||
let handle = tokio::spawn(
|
|
||||||
async move {
|
|
||||||
let connection_result = handle_walreceiver_connection(
|
|
||||||
id,
|
|
||||||
&wal_producer_connstr,
|
|
||||||
&events_sender,
|
|
||||||
&mut cancellation_receiver,
|
|
||||||
connect_timeout,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.map_err(|e| {
|
|
||||||
format!("Walreceiver connection for id {id} failed with error: {e:#}")
|
|
||||||
});
|
|
||||||
|
|
||||||
match &connection_result {
|
|
||||||
Ok(()) => {
|
|
||||||
debug!("Walreceiver connection for id {id} ended successfully")
|
|
||||||
}
|
|
||||||
Err(e) => warn!("{e}"),
|
|
||||||
}
|
|
||||||
events_sender
|
|
||||||
.send(WalConnectionEvent::End(connection_result))
|
|
||||||
.ok();
|
|
||||||
}
|
|
||||||
.instrument(info_span!("safekeeper_handle", sk = %safekeeper_id)),
|
|
||||||
);
|
|
||||||
|
|
||||||
Self {
|
|
||||||
handle,
|
|
||||||
cancellation,
|
|
||||||
events_receiver,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Polls for the next WAL receiver event, if there's any available since the last check.
|
|
||||||
/// Blocks if there's no new event available, returns `None` if no new events will ever occur.
|
|
||||||
/// Only the last event is returned, all events received between observatins are lost.
|
|
||||||
pub async fn next_event(&mut self) -> Option<WalConnectionEvent> {
|
|
||||||
match self.events_receiver.changed().await {
|
|
||||||
Ok(()) => Some(self.events_receiver.borrow().clone()),
|
|
||||||
Err(_cancellation_error) => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Gracefully aborts current WAL streaming task, waiting for the current WAL streamed.
|
|
||||||
pub async fn shutdown(&mut self) -> anyhow::Result<()> {
|
|
||||||
self.cancellation.send(()).ok();
|
|
||||||
let handle = &mut self.handle;
|
|
||||||
handle
|
|
||||||
.await
|
|
||||||
.context("Failed to join on a walreceiver connection task")?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn handle_walreceiver_connection(
|
|
||||||
id: ZTenantTimelineId,
|
id: ZTenantTimelineId,
|
||||||
wal_producer_connstr: &str,
|
wal_producer_connstr: &str,
|
||||||
events_sender: &watch::Sender<WalConnectionEvent>,
|
events_sender: &watch::Sender<TaskEvent<ReplicationFeedback>>,
|
||||||
cancellation: &mut watch::Receiver<()>,
|
mut cancellation: watch::Receiver<()>,
|
||||||
connect_timeout: Duration,
|
connect_timeout: Duration,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// Connect to the database in replication mode.
|
// Connect to the database in replication mode.
|
||||||
@@ -214,8 +130,6 @@ async fn handle_walreceiver_connection(
|
|||||||
|
|
||||||
while let Some(replication_message) = {
|
while let Some(replication_message) = {
|
||||||
select! {
|
select! {
|
||||||
// check for shutdown first
|
|
||||||
biased;
|
|
||||||
_ = cancellation.changed() => {
|
_ = cancellation.changed() => {
|
||||||
info!("walreceiver interrupted");
|
info!("walreceiver interrupted");
|
||||||
None
|
None
|
||||||
@@ -328,7 +242,7 @@ async fn handle_walreceiver_connection(
|
|||||||
|
|
||||||
// Send zenith feedback message.
|
// Send zenith feedback message.
|
||||||
// Regular standby_status_update fields are put into this message.
|
// Regular standby_status_update fields are put into this message.
|
||||||
let zenith_status_update = ZenithFeedback {
|
let zenith_status_update = ReplicationFeedback {
|
||||||
current_timeline_size: timeline.get_current_logical_size() as u64,
|
current_timeline_size: timeline.get_current_logical_size() as u64,
|
||||||
ps_writelsn: write_lsn,
|
ps_writelsn: write_lsn,
|
||||||
ps_flushlsn: flush_lsn,
|
ps_flushlsn: flush_lsn,
|
||||||
@@ -344,7 +258,7 @@ async fn handle_walreceiver_connection(
|
|||||||
.as_mut()
|
.as_mut()
|
||||||
.zenith_status_update(data.len() as u64, &data)
|
.zenith_status_update(data.len() as u64, &data)
|
||||||
.await?;
|
.await?;
|
||||||
if let Err(e) = events_sender.send(WalConnectionEvent::NewWal(zenith_status_update)) {
|
if let Err(e) = events_sender.send(TaskEvent::NewEvent(zenith_status_update)) {
|
||||||
warn!("Wal connection event listener dropped, aborting the connection: {e}");
|
warn!("Wal connection event listener dropped, aborting the connection: {e}");
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
@@ -39,6 +39,8 @@ utils = { path = "../libs/utils" }
|
|||||||
metrics = { path = "../libs/metrics" }
|
metrics = { path = "../libs/metrics" }
|
||||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||||
|
|
||||||
|
x509-parser = "0.13.2"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
rcgen = "0.8.14"
|
rcgen = "0.8.14"
|
||||||
rstest = "0.12"
|
rstest = "0.12"
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ pub type Result<T> = std::result::Result<T, ConsoleAuthError>;
|
|||||||
#[derive(Debug, Error)]
|
#[derive(Debug, Error)]
|
||||||
pub enum ConsoleAuthError {
|
pub enum ConsoleAuthError {
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
BadProjectName(#[from] auth::credentials::ProjectNameError),
|
BadProjectName(#[from] auth::credentials::ClientCredsParseError),
|
||||||
|
|
||||||
// We shouldn't include the actual secret here.
|
// We shouldn't include the actual secret here.
|
||||||
#[error("Bad authentication secret")]
|
#[error("Bad authentication secret")]
|
||||||
@@ -49,6 +49,12 @@ impl UserFacingError for ConsoleAuthError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Lets a *borrowed* credentials-parsing error be converted into a `ConsoleAuthError`,
/// so that `?` can be applied to a `Result<_, &ClientCredsParseError>`.
impl From<&auth::credentials::ClientCredsParseError> for ConsoleAuthError {
    /// Clones the borrowed error and wraps the copy in `BadProjectName`.
    fn from(err: &auth::credentials::ClientCredsParseError) -> Self {
        let owned = err.clone();
        Self::BadProjectName(owned)
    }
}
|
||||||
|
|
||||||
// TODO: convert into an enum with "error"
|
// TODO: convert into an enum with "error"
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
struct GetRoleSecretResponse {
|
struct GetRoleSecretResponse {
|
||||||
@@ -74,18 +80,12 @@ pub enum AuthInfo {
|
|||||||
pub(super) struct Api<'a> {
|
pub(super) struct Api<'a> {
|
||||||
endpoint: &'a ApiUrl,
|
endpoint: &'a ApiUrl,
|
||||||
creds: &'a ClientCredentials,
|
creds: &'a ClientCredentials,
|
||||||
/// Cache project name, since we'll need it several times.
|
|
||||||
project: &'a str,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Api<'a> {
|
impl<'a> Api<'a> {
|
||||||
/// Construct an API object containing the auth parameters.
|
/// Construct an API object containing the auth parameters.
|
||||||
pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result<Self> {
|
pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result<Self> {
|
||||||
Ok(Self {
|
Ok(Self { endpoint, creds })
|
||||||
endpoint,
|
|
||||||
creds,
|
|
||||||
project: creds.project_name()?,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Authenticate the existing user or throw an error.
|
/// Authenticate the existing user or throw an error.
|
||||||
@@ -100,7 +100,7 @@ impl<'a> Api<'a> {
|
|||||||
let mut url = self.endpoint.clone();
|
let mut url = self.endpoint.clone();
|
||||||
url.path_segments_mut().push("proxy_get_role_secret");
|
url.path_segments_mut().push("proxy_get_role_secret");
|
||||||
url.query_pairs_mut()
|
url.query_pairs_mut()
|
||||||
.append_pair("project", self.project)
|
.append_pair("project", self.creds.project_name.as_ref()?)
|
||||||
.append_pair("role", &self.creds.user);
|
.append_pair("role", &self.creds.user);
|
||||||
|
|
||||||
// TODO: use a proper logger
|
// TODO: use a proper logger
|
||||||
@@ -123,7 +123,8 @@ impl<'a> Api<'a> {
|
|||||||
async fn wake_compute(&self) -> Result<DatabaseInfo> {
|
async fn wake_compute(&self) -> Result<DatabaseInfo> {
|
||||||
let mut url = self.endpoint.clone();
|
let mut url = self.endpoint.clone();
|
||||||
url.path_segments_mut().push("proxy_wake_compute");
|
url.path_segments_mut().push("proxy_wake_compute");
|
||||||
url.query_pairs_mut().append_pair("project", self.project);
|
let project_name = self.creds.project_name.as_ref()?;
|
||||||
|
url.query_pairs_mut().append_pair("project", project_name);
|
||||||
|
|
||||||
// TODO: use a proper logger
|
// TODO: use a proper logger
|
||||||
println!("cplane request: {url}");
|
println!("cplane request: {url}");
|
||||||
|
|||||||
@@ -8,10 +8,32 @@ use std::collections::HashMap;
|
|||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use tokio::io::{AsyncRead, AsyncWrite};
|
use tokio::io::{AsyncRead, AsyncWrite};
|
||||||
|
|
||||||
#[derive(Debug, Error)]
|
#[derive(Debug, Error, PartialEq, Eq, Clone)]
|
||||||
pub enum ClientCredsParseError {
|
pub enum ClientCredsParseError {
|
||||||
#[error("Parameter `{0}` is missing in startup packet")]
|
#[error("Parameter `{0}` is missing in startup packet.")]
|
||||||
MissingKey(&'static str),
|
MissingKey(&'static str),
|
||||||
|
|
||||||
|
#[error(
|
||||||
|
"Project name is not specified. \
|
||||||
|
EITHER please upgrade the postgres client library (libpq) for SNI support \
|
||||||
|
OR pass the project name as a parameter: '&options=project%3D<project-name>'."
|
||||||
|
)]
|
||||||
|
MissingSNIAndProjectName,
|
||||||
|
|
||||||
|
#[error("Inconsistent project name inferred from SNI ('{0}') and project option ('{1}').")]
|
||||||
|
InconsistentProjectNameAndSNI(String, String),
|
||||||
|
|
||||||
|
#[error("Common name is not set.")]
|
||||||
|
CommonNameNotSet,
|
||||||
|
|
||||||
|
#[error(
|
||||||
|
"SNI ('{1}') inconsistently formatted with respect to common name ('{0}'). \
|
||||||
|
SNI should be formatted as '<project-name>.<common-name>'."
|
||||||
|
)]
|
||||||
|
InconsistentCommonNameAndSNI(String, String),
|
||||||
|
|
||||||
|
#[error("Project name ('{0}') must contain only alphanumeric characters and hyphens ('-').")]
|
||||||
|
ProjectNameContainsIllegalChars(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl UserFacingError for ClientCredsParseError {}
|
impl UserFacingError for ClientCredsParseError {}
|
||||||
@@ -22,15 +44,7 @@ impl UserFacingError for ClientCredsParseError {}
|
|||||||
pub struct ClientCredentials {
|
pub struct ClientCredentials {
|
||||||
pub user: String,
|
pub user: String,
|
||||||
pub dbname: String,
|
pub dbname: String,
|
||||||
|
pub project_name: Result<String, ClientCredsParseError>,
|
||||||
// New console API requires SNI info to determine the cluster name.
|
|
||||||
// Other Auth backends don't need it.
|
|
||||||
pub sni_data: Option<String>,
|
|
||||||
|
|
||||||
// project_name is passed as argument from options from url.
|
|
||||||
// In case sni_data is missing: project_name is used to determine cluster name.
|
|
||||||
// In case sni_data is available: project_name and sni_data should match (otherwise throws an error).
|
|
||||||
pub project_name: Option<String>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ClientCredentials {
|
impl ClientCredentials {
|
||||||
@@ -38,60 +52,14 @@ impl ClientCredentials {
|
|||||||
// This logic will likely change in the future.
|
// This logic will likely change in the future.
|
||||||
self.user.ends_with("@zenith")
|
self.user.ends_with("@zenith")
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Error)]
|
pub fn parse(
|
||||||
pub enum ProjectNameError {
|
mut options: HashMap<String, String>,
|
||||||
#[error("SNI is missing. EITHER please upgrade the postgres client library OR pass the project name as a parameter: '...&options=project%3D<project-name>...'.")]
|
sni_data: Option<&str>,
|
||||||
Missing,
|
common_name: Option<&str>,
|
||||||
|
) -> Result<Self, ClientCredsParseError> {
|
||||||
#[error("SNI is malformed.")]
|
|
||||||
Bad,
|
|
||||||
|
|
||||||
#[error("Inconsistent project name inferred from SNI and project option. String from SNI: '{0}', String from project option: '{1}'")]
|
|
||||||
Inconsistent(String, String),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl UserFacingError for ProjectNameError {}
|
|
||||||
|
|
||||||
impl ClientCredentials {
|
|
||||||
/// Determine project name from SNI or from project_name parameter from options argument.
|
|
||||||
pub fn project_name(&self) -> Result<&str, ProjectNameError> {
|
|
||||||
// Checking that if both sni_data and project_name are set, then they should match
|
|
||||||
// otherwise, throws a ProjectNameError::Inconsistent error.
|
|
||||||
if let Some(sni_data) = &self.sni_data {
|
|
||||||
let project_name_from_sni_data =
|
|
||||||
sni_data.split_once('.').ok_or(ProjectNameError::Bad)?.0;
|
|
||||||
if let Some(project_name_from_options) = &self.project_name {
|
|
||||||
if !project_name_from_options.eq(project_name_from_sni_data) {
|
|
||||||
return Err(ProjectNameError::Inconsistent(
|
|
||||||
project_name_from_sni_data.to_string(),
|
|
||||||
project_name_from_options.to_string(),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// determine the project name from self.sni_data if it exists, otherwise from self.project_name.
|
|
||||||
let ret = match &self.sni_data {
|
|
||||||
// if sni_data exists, use it to determine project name
|
|
||||||
Some(sni_data) => sni_data.split_once('.').ok_or(ProjectNameError::Bad)?.0,
|
|
||||||
// otherwise use project_option if it was manually set thought options parameter.
|
|
||||||
None => self
|
|
||||||
.project_name
|
|
||||||
.as_ref()
|
|
||||||
.ok_or(ProjectNameError::Missing)?
|
|
||||||
.as_str(),
|
|
||||||
};
|
|
||||||
Ok(ret)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TryFrom<HashMap<String, String>> for ClientCredentials {
|
|
||||||
type Error = ClientCredsParseError;
|
|
||||||
|
|
||||||
fn try_from(mut value: HashMap<String, String>) -> Result<Self, Self::Error> {
|
|
||||||
let mut get_param = |key| {
|
let mut get_param = |key| {
|
||||||
value
|
options
|
||||||
.remove(key)
|
.remove(key)
|
||||||
.ok_or(ClientCredsParseError::MissingKey(key))
|
.ok_or(ClientCredsParseError::MissingKey(key))
|
||||||
};
|
};
|
||||||
@@ -99,17 +67,15 @@ impl TryFrom<HashMap<String, String>> for ClientCredentials {
|
|||||||
let user = get_param("user")?;
|
let user = get_param("user")?;
|
||||||
let dbname = get_param("database")?;
|
let dbname = get_param("database")?;
|
||||||
let project_name = get_param("project").ok();
|
let project_name = get_param("project").ok();
|
||||||
|
let project_name = get_project_name(sni_data, common_name, project_name.as_deref());
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
user,
|
user,
|
||||||
dbname,
|
dbname,
|
||||||
sni_data: None,
|
|
||||||
project_name,
|
project_name,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl ClientCredentials {
|
|
||||||
/// Use credentials to authenticate the user.
|
/// Use credentials to authenticate the user.
|
||||||
pub async fn authenticate(
|
pub async fn authenticate(
|
||||||
self,
|
self,
|
||||||
@@ -120,3 +86,244 @@ impl ClientCredentials {
|
|||||||
super::backend::handle_user(config, client, self).await
|
super::backend::handle_user(config, client, self).await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Inferring project name from sni_data.
|
||||||
|
fn project_name_from_sni_data(
|
||||||
|
sni_data: &str,
|
||||||
|
common_name: &str,
|
||||||
|
) -> Result<String, ClientCredsParseError> {
|
||||||
|
let common_name_with_dot = format!(".{common_name}");
|
||||||
|
// check that ".{common_name_with_dot}" is the actual suffix in sni_data
|
||||||
|
if !sni_data.ends_with(&common_name_with_dot) {
|
||||||
|
return Err(ClientCredsParseError::InconsistentCommonNameAndSNI(
|
||||||
|
common_name.to_string(),
|
||||||
|
sni_data.to_string(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
// return sni_data without the common name suffix.
|
||||||
|
Ok(sni_data
|
||||||
|
.strip_suffix(&common_name_with_dot)
|
||||||
|
.unwrap()
|
||||||
|
.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests_for_project_name_from_sni_data {
    use super::*;

    const PROJECT: &str = "my-project-123";
    const COMMON_NAME: &str = "localtest.me";

    /// An SNI of the form `<project>.<common name>` yields the project name.
    #[test]
    fn passing() {
        let sni = [PROJECT, COMMON_NAME].join(".");
        let expected: Result<String, ClientCredsParseError> = Ok(PROJECT.to_string());
        assert_eq!(project_name_from_sni_data(&sni, COMMON_NAME), expected);
    }

    /// An SNI whose domain part is not the common name is rejected, and the error
    /// carries the expected common name together with the offending SNI.
    #[test]
    fn throws_inconsistent_common_name_and_sni_data() {
        let wrong_suffix = "wrongtest.me";
        // The bogus suffix has the same length as the real common name.
        assert_eq!(COMMON_NAME.len(), wrong_suffix.len());

        let wrong_common_name = format!("wrong{wrong_suffix}");
        let sni = format!("{PROJECT}.{wrong_common_name}");

        let actual = project_name_from_sni_data(&sni, COMMON_NAME);
        let expected = Err(ClientCredsParseError::InconsistentCommonNameAndSNI(
            COMMON_NAME.to_string(),
            sni,
        ));
        assert_eq!(actual, expected);
    }
}
|
||||||
|
|
||||||
|
/// Determine project name from SNI or from project_name parameter from options argument.
|
||||||
|
fn get_project_name(
|
||||||
|
sni_data: Option<&str>,
|
||||||
|
common_name: Option<&str>,
|
||||||
|
project_name: Option<&str>,
|
||||||
|
) -> Result<String, ClientCredsParseError> {
|
||||||
|
// determine the project name from sni_data if it exists, otherwise from project_name.
|
||||||
|
let ret = match sni_data {
|
||||||
|
Some(sni_data) => {
|
||||||
|
let common_name = common_name.ok_or(ClientCredsParseError::CommonNameNotSet)?;
|
||||||
|
let project_name_from_sni = project_name_from_sni_data(sni_data, common_name)?;
|
||||||
|
// check invariant: project name from options and from sni should match
|
||||||
|
if let Some(project_name) = &project_name {
|
||||||
|
if !project_name_from_sni.eq(project_name) {
|
||||||
|
return Err(ClientCredsParseError::InconsistentProjectNameAndSNI(
|
||||||
|
project_name_from_sni,
|
||||||
|
project_name.to_string(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
project_name_from_sni
|
||||||
|
}
|
||||||
|
None => project_name
|
||||||
|
.ok_or(ClientCredsParseError::MissingSNIAndProjectName)?
|
||||||
|
.to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
// check formatting invariant: project name must contain only alphanumeric characters and hyphens.
|
||||||
|
if !ret.chars().all(|x: char| x.is_alphanumeric() || x == '-') {
|
||||||
|
return Err(ClientCredsParseError::ProjectNameContainsIllegalChars(ret));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(ret)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests_for_project_name_only {
    use super::*;

    const PROJECT: &str = "my-project-123";
    const OTHER_PROJECT: &str = "not-my-project-123";
    const COMMON_NAME: &str = "localtest.me";

    /// Builds an SNI host name of the form `<project>.<domain>`.
    fn sni_for(project: &str, domain: &str) -> String {
        format!("{project}.{domain}")
    }

    /// Project names `my-project<c>123` for every character `c` with a code point below
    /// 256 that is encoded as a single byte and is neither alphanumeric nor a hyphen.
    fn names_with_illegal_char() -> Vec<String> {
        (0..256u32)
            .map(|code| char::from_u32(code).unwrap())
            .filter(|c| c.len_utf8() == 1 && !c.is_alphanumeric() && *c != '-')
            .map(|c| format!("my-project{c}123"))
            .collect()
    }

    #[test]
    fn passing_from_sni_data_only() {
        let sni = sni_for(PROJECT, COMMON_NAME);
        let actual = get_project_name(Some(sni.as_str()), Some(COMMON_NAME), None);
        assert_eq!(actual, Ok(PROJECT.to_string()));
    }

    #[test]
    fn throws_project_name_contains_illegal_chars_from_sni_data_only() {
        for bad_name in names_with_illegal_char() {
            let sni = sni_for(&bad_name, COMMON_NAME);
            assert_eq!(
                get_project_name(Some(sni.as_str()), Some(COMMON_NAME), None),
                Err(ClientCredsParseError::ProjectNameContainsIllegalChars(bad_name))
            );
        }
    }

    #[test]
    fn passing_from_project_name_only() {
        // Without SNI the common name is irrelevant, whether it is set or not.
        for common_name in [Some(COMMON_NAME), None] {
            assert_eq!(
                get_project_name(None, common_name, Some(PROJECT)),
                Ok(PROJECT.to_string())
            );
        }
    }

    #[test]
    fn throws_project_name_contains_illegal_chars_from_project_name_only() {
        for common_name in [Some(COMMON_NAME), None] {
            for bad_name in names_with_illegal_char() {
                assert_eq!(
                    get_project_name(None, common_name, Some(bad_name.as_str())),
                    Err(ClientCredsParseError::ProjectNameContainsIllegalChars(bad_name))
                );
            }
        }
    }

    #[test]
    fn passing_from_sni_data_and_project_name() {
        let sni = sni_for(PROJECT, COMMON_NAME);
        let actual = get_project_name(Some(sni.as_str()), Some(COMMON_NAME), Some(PROJECT));
        assert_eq!(actual, Ok(PROJECT.to_string()));
    }

    #[test]
    fn throws_inconsistent_project_name_and_sni() {
        // SNI names one project, the option names another.
        let sni = sni_for(OTHER_PROJECT, COMMON_NAME);
        let actual = get_project_name(Some(sni.as_str()), Some(COMMON_NAME), Some(PROJECT));
        assert_eq!(
            actual,
            Err(ClientCredsParseError::InconsistentProjectNameAndSNI(
                OTHER_PROJECT.to_string(),
                PROJECT.to_string()
            ))
        );
    }

    #[test]
    fn throws_common_name_not_set() {
        let snis = [
            sni_for(OTHER_PROJECT, COMMON_NAME),
            sni_for(PROJECT, COMMON_NAME),
        ];
        for sni in &snis {
            for project_option in [None, Some(PROJECT)] {
                assert_eq!(
                    get_project_name(Some(sni.as_str()), None, project_option),
                    Err(ClientCredsParseError::CommonNameNotSet)
                );
            }
        }
    }

    #[test]
    fn throws_inconsistent_common_name_and_sni_data() {
        let wrong_suffix = "wrongtest.me";
        // The bogus suffix has the same length as the real common name.
        assert_eq!(COMMON_NAME.len(), wrong_suffix.len());

        let wrong_common_name = format!("wrong{wrong_suffix}");
        let snis = [
            sni_for(OTHER_PROJECT, &wrong_common_name),
            sni_for(PROJECT, &wrong_common_name),
        ];
        for project_option in [None, Some(PROJECT)] {
            for sni in &snis {
                assert_eq!(
                    get_project_name(Some(sni.as_str()), Some(COMMON_NAME), project_option),
                    Err(ClientCredsParseError::InconsistentCommonNameAndSNI(
                        COMMON_NAME.to_string(),
                        sni.clone()
                    ))
                );
            }
        }
    }
}
|
||||||
|
|||||||
@@ -36,23 +36,35 @@ pub struct ProxyConfig {
|
|||||||
pub auth_link_uri: ApiUrl,
|
pub auth_link_uri: ApiUrl,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub type TlsConfig = Arc<rustls::ServerConfig>;
|
pub struct TlsConfig {
|
||||||
|
pub config: Arc<rustls::ServerConfig>,
|
||||||
|
pub common_name: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TlsConfig {
|
||||||
|
pub fn to_server_config(&self) -> Arc<rustls::ServerConfig> {
|
||||||
|
self.config.clone()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Configure TLS for the main endpoint.
|
/// Configure TLS for the main endpoint.
|
||||||
pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfig> {
|
pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfig> {
|
||||||
let key = {
|
let key = {
|
||||||
let key_bytes = std::fs::read(key_path).context("TLS key file")?;
|
let key_bytes = std::fs::read(key_path).context("TLS key file")?;
|
||||||
let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
|
let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
|
||||||
.context("couldn't read TLS keys")?;
|
.context(format!("Failed to read TLS keys at '{key_path}'"))?;
|
||||||
|
|
||||||
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
||||||
keys.pop().map(rustls::PrivateKey).unwrap()
|
keys.pop().map(rustls::PrivateKey).unwrap()
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let cert_chain_bytes = std::fs::read(cert_path)
|
||||||
|
.context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
|
||||||
let cert_chain = {
|
let cert_chain = {
|
||||||
let cert_chain_bytes = std::fs::read(cert_path).context("TLS cert file")?;
|
|
||||||
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
|
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
|
||||||
.context("couldn't read TLS certificate chain")?
|
.context(format!(
|
||||||
|
"Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
|
||||||
|
))?
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(rustls::Certificate)
|
.map(rustls::Certificate)
|
||||||
.collect()
|
.collect()
|
||||||
@@ -64,7 +76,25 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfi
|
|||||||
// allow TLS 1.2 to be compatible with older client libraries
|
// allow TLS 1.2 to be compatible with older client libraries
|
||||||
.with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
|
.with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
|
||||||
.with_no_client_auth()
|
.with_no_client_auth()
|
||||||
.with_single_cert(cert_chain, key)?;
|
.with_single_cert(cert_chain, key)?
|
||||||
|
.into();
|
||||||
|
|
||||||
Ok(config.into())
|
// determine common name from tls-cert (-c server.crt param).
|
||||||
|
// used in asserting project name formatting invariant.
|
||||||
|
let common_name = {
|
||||||
|
let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes)
|
||||||
|
.context(format!(
|
||||||
|
"Failed to parse PEM object from bytes from file at '{cert_path}'."
|
||||||
|
))?
|
||||||
|
.1;
|
||||||
|
let almost_common_name = pem.parse_x509()?.tbs_certificate.subject.to_string();
|
||||||
|
let expected_prefix = "CN=*.";
|
||||||
|
let common_name = almost_common_name.strip_prefix(expected_prefix);
|
||||||
|
common_name.map(str::to_string)
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(TlsConfig {
|
||||||
|
config,
|
||||||
|
common_name,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -81,7 +81,7 @@ async fn handle_client(
|
|||||||
NUM_CONNECTIONS_CLOSED_COUNTER.inc();
|
NUM_CONNECTIONS_CLOSED_COUNTER.inc();
|
||||||
}
|
}
|
||||||
|
|
||||||
let tls = config.tls_config.clone();
|
let tls = config.tls_config.as_ref();
|
||||||
let (stream, creds) = match handshake(stream, tls, cancel_map).await? {
|
let (stream, creds) = match handshake(stream, tls, cancel_map).await? {
|
||||||
Some(x) => x,
|
Some(x) => x,
|
||||||
None => return Ok(()), // it's a cancellation request
|
None => return Ok(()), // it's a cancellation request
|
||||||
@@ -99,12 +99,14 @@ async fn handle_client(
|
|||||||
/// we also take an extra care of propagating only the select handshake errors to client.
|
/// we also take an extra care of propagating only the select handshake errors to client.
|
||||||
async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||||
stream: S,
|
stream: S,
|
||||||
mut tls: Option<TlsConfig>,
|
mut tls: Option<&TlsConfig>,
|
||||||
cancel_map: &CancelMap,
|
cancel_map: &CancelMap,
|
||||||
) -> anyhow::Result<Option<(PqStream<Stream<S>>, auth::ClientCredentials)>> {
|
) -> anyhow::Result<Option<(PqStream<Stream<S>>, auth::ClientCredentials)>> {
|
||||||
// Client may try upgrading to each protocol only once
|
// Client may try upgrading to each protocol only once
|
||||||
let (mut tried_ssl, mut tried_gss) = (false, false);
|
let (mut tried_ssl, mut tried_gss) = (false, false);
|
||||||
|
|
||||||
|
let common_name = tls.and_then(|cfg| cfg.common_name.as_deref());
|
||||||
|
|
||||||
let mut stream = PqStream::new(Stream::from_raw(stream));
|
let mut stream = PqStream::new(Stream::from_raw(stream));
|
||||||
loop {
|
loop {
|
||||||
let msg = stream.read_startup_packet().await?;
|
let msg = stream.read_startup_packet().await?;
|
||||||
@@ -122,7 +124,9 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
|||||||
if let Some(tls) = tls.take() {
|
if let Some(tls) = tls.take() {
|
||||||
// Upgrade raw stream into a secure TLS-backed stream.
|
// Upgrade raw stream into a secure TLS-backed stream.
|
||||||
// NOTE: We've consumed `tls`; this fact will be used later.
|
// NOTE: We've consumed `tls`; this fact will be used later.
|
||||||
stream = PqStream::new(stream.into_inner().upgrade(tls).await?);
|
stream = PqStream::new(
|
||||||
|
stream.into_inner().upgrade(tls.to_server_config()).await?,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_ => bail!(ERR_PROTO_VIOLATION),
|
_ => bail!(ERR_PROTO_VIOLATION),
|
||||||
@@ -143,15 +147,16 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
|||||||
stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
|
stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Here and forth: `or_else` demands that we use a future here
|
// Get SNI info when available
|
||||||
let mut creds: auth::ClientCredentials = async { params.try_into() }
|
let sni_data = match stream.get_ref() {
|
||||||
.or_else(|e| stream.throw_error(e))
|
Stream::Tls { tls } => tls.get_ref().1.sni_hostname().map(|s| s.to_owned()),
|
||||||
.await?;
|
_ => None,
|
||||||
|
};
|
||||||
|
|
||||||
// Set SNI info when available
|
// Construct credentials
|
||||||
if let Stream::Tls { tls } = stream.get_ref() {
|
let creds =
|
||||||
creds.sni_data = tls.get_ref().1.sni_hostname().map(|s| s.to_owned());
|
auth::ClientCredentials::parse(params, sni_data.as_deref(), common_name);
|
||||||
}
|
let creds = async { creds }.or_else(|e| stream.throw_error(e)).await?;
|
||||||
|
|
||||||
break Ok(Some((stream, creds)));
|
break Ok(Some((stream, creds)));
|
||||||
}
|
}
|
||||||
@@ -264,12 +269,13 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Generate TLS certificates and build rustls configs for client and server.
|
/// Generate TLS certificates and build rustls configs for client and server.
|
||||||
fn generate_tls_config(
|
fn generate_tls_config<'a>(
|
||||||
hostname: &str,
|
hostname: &'a str,
|
||||||
) -> anyhow::Result<(ClientConfig<'_>, Arc<rustls::ServerConfig>)> {
|
common_name: &'a str,
|
||||||
|
) -> anyhow::Result<(ClientConfig<'a>, TlsConfig)> {
|
||||||
let (ca, cert, key) = generate_certs(hostname)?;
|
let (ca, cert, key) = generate_certs(hostname)?;
|
||||||
|
|
||||||
let server_config = {
|
let tls_config = {
|
||||||
let config = rustls::ServerConfig::builder()
|
let config = rustls::ServerConfig::builder()
|
||||||
.with_safe_defaults()
|
.with_safe_defaults()
|
||||||
.with_no_client_auth()
|
.with_no_client_auth()
|
||||||
@@ -291,7 +297,12 @@ mod tests {
|
|||||||
ClientConfig { config, hostname }
|
ClientConfig { config, hostname }
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok((client_config, server_config))
|
let tls_config = TlsConfig {
|
||||||
|
config: tls_config,
|
||||||
|
common_name: Some(common_name.to_string()),
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok((client_config, tls_config))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
@@ -346,7 +357,7 @@ mod tests {
|
|||||||
auth: impl TestAuth + Send,
|
auth: impl TestAuth + Send,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let cancel_map = CancelMap::default();
|
let cancel_map = CancelMap::default();
|
||||||
let (mut stream, _creds) = handshake(client, tls, &cancel_map)
|
let (mut stream, _creds) = handshake(client, tls.as_ref(), &cancel_map)
|
||||||
.await?
|
.await?
|
||||||
.context("handshake failed")?;
|
.context("handshake failed")?;
|
||||||
|
|
||||||
@@ -365,7 +376,8 @@ mod tests {
|
|||||||
async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> {
|
async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> {
|
||||||
let (client, server) = tokio::io::duplex(1024);
|
let (client, server) = tokio::io::duplex(1024);
|
||||||
|
|
||||||
let (_, server_config) = generate_tls_config("localhost")?;
|
let (_, server_config) =
|
||||||
|
generate_tls_config("generic-project-name.localhost", "localhost")?;
|
||||||
let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth));
|
let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth));
|
||||||
|
|
||||||
let client_err = tokio_postgres::Config::new()
|
let client_err = tokio_postgres::Config::new()
|
||||||
@@ -393,7 +405,8 @@ mod tests {
|
|||||||
async fn handshake_tls() -> anyhow::Result<()> {
|
async fn handshake_tls() -> anyhow::Result<()> {
|
||||||
let (client, server) = tokio::io::duplex(1024);
|
let (client, server) = tokio::io::duplex(1024);
|
||||||
|
|
||||||
let (client_config, server_config) = generate_tls_config("localhost")?;
|
let (client_config, server_config) =
|
||||||
|
generate_tls_config("generic-project-name.localhost", "localhost")?;
|
||||||
let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth));
|
let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth));
|
||||||
|
|
||||||
let (_client, _conn) = tokio_postgres::Config::new()
|
let (_client, _conn) = tokio_postgres::Config::new()
|
||||||
@@ -415,6 +428,7 @@ mod tests {
|
|||||||
let (_client, _conn) = tokio_postgres::Config::new()
|
let (_client, _conn) = tokio_postgres::Config::new()
|
||||||
.user("john_doe")
|
.user("john_doe")
|
||||||
.dbname("earth")
|
.dbname("earth")
|
||||||
|
.options("project=generic-project-name")
|
||||||
.ssl_mode(SslMode::Prefer)
|
.ssl_mode(SslMode::Prefer)
|
||||||
.connect_raw(server, NoTls)
|
.connect_raw(server, NoTls)
|
||||||
.await?;
|
.await?;
|
||||||
@@ -476,7 +490,8 @@ mod tests {
|
|||||||
async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
|
async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
|
||||||
let (client, server) = tokio::io::duplex(1024);
|
let (client, server) = tokio::io::duplex(1024);
|
||||||
|
|
||||||
let (client_config, server_config) = generate_tls_config("localhost")?;
|
let (client_config, server_config) =
|
||||||
|
generate_tls_config("generic-project-name.localhost", "localhost")?;
|
||||||
let proxy = tokio::spawn(dummy_proxy(
|
let proxy = tokio::spawn(dummy_proxy(
|
||||||
client,
|
client,
|
||||||
Some(server_config),
|
Some(server_config),
|
||||||
@@ -498,7 +513,8 @@ mod tests {
|
|||||||
async fn scram_auth_mock() -> anyhow::Result<()> {
|
async fn scram_auth_mock() -> anyhow::Result<()> {
|
||||||
let (client, server) = tokio::io::duplex(1024);
|
let (client, server) = tokio::io::duplex(1024);
|
||||||
|
|
||||||
let (client_config, server_config) = generate_tls_config("localhost")?;
|
let (client_config, server_config) =
|
||||||
|
generate_tls_config("generic-project-name.localhost", "localhost")?;
|
||||||
let proxy = tokio::spawn(dummy_proxy(
|
let proxy = tokio::spawn(dummy_proxy(
|
||||||
client,
|
client,
|
||||||
Some(server_config),
|
Some(server_config),
|
||||||
|
|||||||
@@ -115,7 +115,7 @@ mod tests {
|
|||||||
Ok(())
|
Ok(())
|
||||||
});
|
});
|
||||||
|
|
||||||
let () = waiter.await?;
|
waiter.await?;
|
||||||
notifier.await?
|
notifier.await?
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ use remote_storage::RemoteStorageConfig;
|
|||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::{ErrorKind, Write};
|
use std::io::{ErrorKind, Write};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::sync::Arc;
|
||||||
use std::thread;
|
use std::thread;
|
||||||
use tokio::sync::mpsc;
|
use tokio::sync::mpsc;
|
||||||
use toml_edit::Document;
|
use toml_edit::Document;
|
||||||
@@ -27,6 +28,7 @@ use safekeeper::timeline::GlobalTimelines;
|
|||||||
use safekeeper::wal_backup;
|
use safekeeper::wal_backup;
|
||||||
use safekeeper::wal_service;
|
use safekeeper::wal_service;
|
||||||
use safekeeper::SafeKeeperConf;
|
use safekeeper::SafeKeeperConf;
|
||||||
|
use utils::auth::JwtAuth;
|
||||||
use utils::{
|
use utils::{
|
||||||
http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener,
|
http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener,
|
||||||
zid::NodeId,
|
zid::NodeId,
|
||||||
@@ -132,6 +134,12 @@ fn main() -> anyhow::Result<()> {
|
|||||||
.default_missing_value("true")
|
.default_missing_value("true")
|
||||||
.help("Enable/disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring WAL backup horizon."),
|
.help("Enable/disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring WAL backup horizon."),
|
||||||
)
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::new("auth-validation-public-key-path")
|
||||||
|
.long("auth-validation-public-key-path")
|
||||||
|
.takes_value(true)
|
||||||
|
.help("Path to an RSA .pem public key which is used to check JWT tokens")
|
||||||
|
)
|
||||||
.get_matches();
|
.get_matches();
|
||||||
|
|
||||||
if let Some(addr) = arg_matches.value_of("dump-control-file") {
|
if let Some(addr) = arg_matches.value_of("dump-control-file") {
|
||||||
@@ -204,6 +212,10 @@ fn main() -> anyhow::Result<()> {
|
|||||||
.parse()
|
.parse()
|
||||||
.context("failed to parse bool enable-s3-offload bool")?;
|
.context("failed to parse bool enable-s3-offload bool")?;
|
||||||
|
|
||||||
|
conf.auth_validation_public_key_path = arg_matches
|
||||||
|
.value_of("auth-validation-public-key-path")
|
||||||
|
.map(PathBuf::from);
|
||||||
|
|
||||||
start_safekeeper(conf, given_id, arg_matches.is_present("init"))
|
start_safekeeper(conf, given_id, arg_matches.is_present("init"))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -239,6 +251,19 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
|||||||
e
|
e
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
|
let auth = match conf.auth_validation_public_key_path.as_ref() {
|
||||||
|
None => {
|
||||||
|
info!("Auth is disabled");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
Some(path) => {
|
||||||
|
info!("Loading JWT auth key from {}", path.display());
|
||||||
|
Some(Arc::new(
|
||||||
|
JwtAuth::from_key_path(path).context("failed to load the auth key")?,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// XXX: Don't spawn any threads before daemonizing!
|
// XXX: Don't spawn any threads before daemonizing!
|
||||||
if conf.daemonize {
|
if conf.daemonize {
|
||||||
info!("daemonizing...");
|
info!("daemonizing...");
|
||||||
@@ -280,8 +305,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
|||||||
thread::Builder::new()
|
thread::Builder::new()
|
||||||
.name("http_endpoint_thread".into())
|
.name("http_endpoint_thread".into())
|
||||||
.spawn(|| {
|
.spawn(|| {
|
||||||
// TODO authentication
|
let router = http::make_router(conf_, auth);
|
||||||
let router = http::make_router(conf_);
|
|
||||||
endpoint::serve_thread_main(
|
endpoint::serve_thread_main(
|
||||||
router,
|
router,
|
||||||
http_listener,
|
http_listener,
|
||||||
@@ -295,6 +319,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
|||||||
let safekeeper_thread = thread::Builder::new()
|
let safekeeper_thread = thread::Builder::new()
|
||||||
.name("Safekeeper thread".into())
|
.name("Safekeeper thread".into())
|
||||||
.spawn(|| {
|
.spawn(|| {
|
||||||
|
// TODO: add auth
|
||||||
if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener) {
|
if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener) {
|
||||||
info!("safekeeper thread terminated: {e}");
|
info!("safekeeper thread terminated: {e}");
|
||||||
}
|
}
|
||||||
@@ -309,6 +334,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
|||||||
thread::Builder::new()
|
thread::Builder::new()
|
||||||
.name("broker thread".into())
|
.name("broker thread".into())
|
||||||
.spawn(|| {
|
.spawn(|| {
|
||||||
|
// TODO: add auth?
|
||||||
broker::thread_main(conf_);
|
broker::thread_main(conf_);
|
||||||
})?,
|
})?,
|
||||||
);
|
);
|
||||||
@@ -321,6 +347,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
|||||||
thread::Builder::new()
|
thread::Builder::new()
|
||||||
.name("WAL removal thread".into())
|
.name("WAL removal thread".into())
|
||||||
.spawn(|| {
|
.spawn(|| {
|
||||||
|
// TODO: add auth?
|
||||||
remove_wal::thread_main(conf_);
|
remove_wal::thread_main(conf_);
|
||||||
})?,
|
})?,
|
||||||
);
|
);
|
||||||
@@ -330,6 +357,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
|||||||
thread::Builder::new()
|
thread::Builder::new()
|
||||||
.name("wal backup launcher thread".into())
|
.name("wal backup launcher thread".into())
|
||||||
.spawn(move || {
|
.spawn(move || {
|
||||||
|
// TODO: add auth?
|
||||||
wal_backup::wal_backup_launcher_thread_main(conf_, wal_backup_launcher_rx);
|
wal_backup::wal_backup_launcher_thread_main(conf_, wal_backup_launcher_rx);
|
||||||
})?,
|
})?,
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -4,9 +4,12 @@ use anyhow::anyhow;
|
|||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use anyhow::Error;
|
use anyhow::Error;
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use etcd_broker::Client;
|
use etcd_broker::subscription_value::SkTimelineInfo;
|
||||||
use etcd_broker::PutOptions;
|
use etcd_broker::LeaseKeepAliveStream;
|
||||||
use etcd_broker::SkTimelineSubscriptionKind;
|
use etcd_broker::LeaseKeeper;
|
||||||
|
|
||||||
|
use std::collections::hash_map::Entry;
|
||||||
|
use std::collections::HashMap;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use tokio::spawn;
|
use tokio::spawn;
|
||||||
use tokio::task::JoinHandle;
|
use tokio::task::JoinHandle;
|
||||||
@@ -15,11 +18,15 @@ use tracing::*;
|
|||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
use crate::{timeline::GlobalTimelines, SafeKeeperConf};
|
use crate::{timeline::GlobalTimelines, SafeKeeperConf};
|
||||||
|
use etcd_broker::{
|
||||||
|
subscription_key::{OperationKind, SkOperationKind, SubscriptionKey},
|
||||||
|
Client, PutOptions,
|
||||||
|
};
|
||||||
use utils::zid::{NodeId, ZTenantTimelineId};
|
use utils::zid::{NodeId, ZTenantTimelineId};
|
||||||
|
|
||||||
const RETRY_INTERVAL_MSEC: u64 = 1000;
|
const RETRY_INTERVAL_MSEC: u64 = 1000;
|
||||||
const PUSH_INTERVAL_MSEC: u64 = 1000;
|
const PUSH_INTERVAL_MSEC: u64 = 1000;
|
||||||
const LEASE_TTL_SEC: i64 = 5;
|
const LEASE_TTL_SEC: i64 = 10;
|
||||||
|
|
||||||
pub fn thread_main(conf: SafeKeeperConf) {
|
pub fn thread_main(conf: SafeKeeperConf) {
|
||||||
let runtime = runtime::Builder::new_current_thread()
|
let runtime = runtime::Builder::new_current_thread()
|
||||||
@@ -43,7 +50,7 @@ fn timeline_safekeeper_path(
|
|||||||
) -> String {
|
) -> String {
|
||||||
format!(
|
format!(
|
||||||
"{}/{sk_id}",
|
"{}/{sk_id}",
|
||||||
SkTimelineSubscriptionKind::timeline(broker_etcd_prefix, zttid).watch_key()
|
SubscriptionKey::sk_timeline_info(broker_etcd_prefix, zttid).watch_key()
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -90,7 +97,7 @@ impl ElectionLeader {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn get_leader(req: &Election) -> Result<ElectionLeader> {
|
pub async fn get_leader(req: &Election, leader: &mut Option<ElectionLeader>) -> Result<()> {
|
||||||
let mut client = Client::connect(req.broker_endpoints.clone(), None)
|
let mut client = Client::connect(req.broker_endpoints.clone(), None)
|
||||||
.await
|
.await
|
||||||
.context("Could not connect to etcd")?;
|
.context("Could not connect to etcd")?;
|
||||||
@@ -102,22 +109,27 @@ pub async fn get_leader(req: &Election) -> Result<ElectionLeader> {
|
|||||||
|
|
||||||
let lease_id = lease.map(|l| l.id()).unwrap();
|
let lease_id = lease.map(|l| l.id()).unwrap();
|
||||||
|
|
||||||
let keep_alive = spawn::<_>(lease_keep_alive(client.clone(), lease_id));
|
// kill previous keepalive, if any
|
||||||
|
if let Some(l) = leader.take() {
|
||||||
|
l.give_up().await;
|
||||||
|
}
|
||||||
|
|
||||||
if let Err(e) = client
|
let keep_alive = spawn::<_>(lease_keep_alive(client.clone(), lease_id));
|
||||||
|
// immediately save handle to kill task if we get canceled below
|
||||||
|
*leader = Some(ElectionLeader {
|
||||||
|
client: client.clone(),
|
||||||
|
keep_alive,
|
||||||
|
});
|
||||||
|
|
||||||
|
client
|
||||||
.campaign(
|
.campaign(
|
||||||
req.election_name.clone(),
|
req.election_name.clone(),
|
||||||
req.candidate_name.clone(),
|
req.candidate_name.clone(),
|
||||||
lease_id,
|
lease_id,
|
||||||
)
|
)
|
||||||
.await
|
.await?;
|
||||||
{
|
|
||||||
keep_alive.abort();
|
|
||||||
let _ = keep_alive.await;
|
|
||||||
return Err(e.into());
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(ElectionLeader { client, keep_alive })
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn lease_keep_alive(mut client: Client, lease_id: i64) -> Result<()> {
|
async fn lease_keep_alive(mut client: Client, lease_id: i64) -> Result<()> {
|
||||||
@@ -143,25 +155,52 @@ async fn lease_keep_alive(mut client: Client, lease_id: i64) -> Result<()> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_campaign_name(
|
|
||||||
election_name: &str,
|
|
||||||
broker_prefix: &str,
|
|
||||||
id: ZTenantTimelineId,
|
|
||||||
) -> String {
|
|
||||||
format!("{broker_prefix}/{id}/{election_name}")
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_candiate_name(system_id: NodeId) -> String {
|
pub fn get_candiate_name(system_id: NodeId) -> String {
|
||||||
format!("id_{system_id}")
|
format!("id_{system_id}")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn push_sk_info(
|
||||||
|
zttid: ZTenantTimelineId,
|
||||||
|
mut client: Client,
|
||||||
|
key: String,
|
||||||
|
sk_info: SkTimelineInfo,
|
||||||
|
mut lease: Lease,
|
||||||
|
) -> anyhow::Result<(ZTenantTimelineId, Lease)> {
|
||||||
|
let put_opts = PutOptions::new().with_lease(lease.id);
|
||||||
|
client
|
||||||
|
.put(
|
||||||
|
key.clone(),
|
||||||
|
serde_json::to_string(&sk_info)?,
|
||||||
|
Some(put_opts),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("failed to push safekeeper info to {}", key))?;
|
||||||
|
|
||||||
|
// revive the lease
|
||||||
|
lease
|
||||||
|
.keeper
|
||||||
|
.keep_alive()
|
||||||
|
.await
|
||||||
|
.context("failed to send LeaseKeepAliveRequest")?;
|
||||||
|
lease
|
||||||
|
.ka_stream
|
||||||
|
.message()
|
||||||
|
.await
|
||||||
|
.context("failed to receive LeaseKeepAliveResponse")?;
|
||||||
|
|
||||||
|
Ok((zttid, lease))
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Lease {
|
||||||
|
id: i64,
|
||||||
|
keeper: LeaseKeeper,
|
||||||
|
ka_stream: LeaseKeepAliveStream,
|
||||||
|
}
|
||||||
|
|
||||||
/// Push once in a while data about all active timelines to the broker.
|
/// Push once in a while data about all active timelines to the broker.
|
||||||
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||||
let mut client = Client::connect(&conf.broker_endpoints, None).await?;
|
let mut client = Client::connect(&conf.broker_endpoints, None).await?;
|
||||||
|
let mut leases: HashMap<ZTenantTimelineId, Lease> = HashMap::new();
|
||||||
// Get and maintain lease to automatically delete obsolete data
|
|
||||||
let lease = client.lease_grant(LEASE_TTL_SEC, None).await?;
|
|
||||||
let (mut keeper, mut ka_stream) = client.lease_keep_alive(lease.id()).await?;
|
|
||||||
|
|
||||||
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
|
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
|
||||||
loop {
|
loop {
|
||||||
@@ -169,33 +208,46 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
|||||||
// is under plain mutex. That's ok, all this code is not performance
|
// is under plain mutex. That's ok, all this code is not performance
|
||||||
// sensitive and there is no risk of deadlock as we don't await while
|
// sensitive and there is no risk of deadlock as we don't await while
|
||||||
// lock is held.
|
// lock is held.
|
||||||
for zttid in GlobalTimelines::get_active_timelines() {
|
let active_tlis = GlobalTimelines::get_active_timelines();
|
||||||
if let Some(tli) = GlobalTimelines::get_loaded(zttid) {
|
|
||||||
let sk_info = tli.get_public_info(&conf)?;
|
// // Get and maintain (if not yet) per timeline lease to automatically delete obsolete data.
|
||||||
let put_opts = PutOptions::new().with_lease(lease.id());
|
for zttid in active_tlis.iter() {
|
||||||
client
|
if let Entry::Vacant(v) = leases.entry(*zttid) {
|
||||||
.put(
|
let lease = client.lease_grant(LEASE_TTL_SEC, None).await?;
|
||||||
timeline_safekeeper_path(
|
let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?;
|
||||||
conf.broker_etcd_prefix.clone(),
|
v.insert(Lease {
|
||||||
zttid,
|
id: lease.id(),
|
||||||
conf.my_id,
|
keeper,
|
||||||
),
|
ka_stream,
|
||||||
serde_json::to_string(&sk_info)?,
|
});
|
||||||
Some(put_opts),
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.context("failed to push safekeeper info")?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// revive the lease
|
leases.retain(|zttid, _| active_tlis.contains(zttid));
|
||||||
keeper
|
|
||||||
.keep_alive()
|
// Push data concurrently to not suffer from latency, with many timelines it can be slow.
|
||||||
.await
|
let handles = active_tlis
|
||||||
.context("failed to send LeaseKeepAliveRequest")?;
|
.iter()
|
||||||
ka_stream
|
.filter_map(|zttid| GlobalTimelines::get_loaded(*zttid))
|
||||||
.message()
|
.map(|tli| {
|
||||||
.await
|
let sk_info = tli.get_public_info(&conf);
|
||||||
.context("failed to receive LeaseKeepAliveResponse")?;
|
let key = timeline_safekeeper_path(
|
||||||
|
conf.broker_etcd_prefix.clone(),
|
||||||
|
tli.zttid,
|
||||||
|
conf.my_id,
|
||||||
|
);
|
||||||
|
let lease = leases.remove(&tli.zttid).unwrap();
|
||||||
|
tokio::spawn(push_sk_info(tli.zttid, client.clone(), key, sk_info, lease))
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
for h in handles {
|
||||||
|
let (zttid, lease) = h.await??;
|
||||||
|
// It is ugly to pull leases from hash and then put it back, but
|
||||||
|
// otherwise we have to resort to long living per tli tasks (which
|
||||||
|
// would generate a lot of errors when etcd is down) as task wants to
|
||||||
|
// have 'static objects, we can't borrow to it.
|
||||||
|
leases.insert(zttid, lease);
|
||||||
|
}
|
||||||
|
|
||||||
sleep(push_interval).await;
|
sleep(push_interval).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -204,22 +256,30 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
|||||||
async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
|
async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
|
||||||
let mut client = Client::connect(&conf.broker_endpoints, None).await?;
|
let mut client = Client::connect(&conf.broker_endpoints, None).await?;
|
||||||
|
|
||||||
let mut subscription = etcd_broker::subscribe_to_safekeeper_timeline_updates(
|
let mut subscription = etcd_broker::subscribe_for_values(
|
||||||
&mut client,
|
&mut client,
|
||||||
SkTimelineSubscriptionKind::all(conf.broker_etcd_prefix.clone()),
|
SubscriptionKey::all(conf.broker_etcd_prefix.clone()),
|
||||||
|
|full_key, value_str| {
|
||||||
|
if full_key.operation == OperationKind::Safekeeper(SkOperationKind::TimelineInfo) {
|
||||||
|
match serde_json::from_str::<SkTimelineInfo>(value_str) {
|
||||||
|
Ok(new_info) => return Some(new_info),
|
||||||
|
Err(e) => {
|
||||||
|
error!("Failed to parse timeline info from value str '{value_str}': {e}")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
},
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
.context("failed to subscribe for safekeeper info")?;
|
.context("failed to subscribe for safekeeper info")?;
|
||||||
loop {
|
loop {
|
||||||
match subscription.fetch_data().await {
|
match subscription.value_updates.recv().await {
|
||||||
Some(new_info) => {
|
Some(new_info) => {
|
||||||
for (zttid, sk_info) in new_info {
|
// note: there are blocking operations below, but it's considered fine for now
|
||||||
// note: there are blocking operations below, but it's considered fine for now
|
if let Ok(tli) = GlobalTimelines::get(&conf, new_info.key.id, false) {
|
||||||
if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) {
|
tli.record_safekeeper_info(&new_info.value, new_info.key.node_id)
|
||||||
for (safekeeper_id, info) in sk_info {
|
.await?
|
||||||
tli.record_safekeeper_info(&info, safekeeper_id).await?
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
|
|||||||
@@ -239,6 +239,19 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
|
|||||||
remote_consistent_lsn: Lsn(0),
|
remote_consistent_lsn: Lsn(0),
|
||||||
peers: Peers(vec![]),
|
peers: Peers(vec![]),
|
||||||
});
|
});
|
||||||
|
} else if version == 5 {
|
||||||
|
info!("reading safekeeper control file version {}", version);
|
||||||
|
let mut oldstate = SafeKeeperState::des(&buf[..buf.len()])?;
|
||||||
|
if oldstate.timeline_start_lsn != Lsn(0) {
|
||||||
|
return Ok(oldstate);
|
||||||
|
}
|
||||||
|
|
||||||
|
// set special timeline_start_lsn because we don't know the real one
|
||||||
|
info!("setting timeline_start_lsn and local_start_lsn to Lsn(1)");
|
||||||
|
oldstate.timeline_start_lsn = Lsn(1);
|
||||||
|
oldstate.local_start_lsn = Lsn(1);
|
||||||
|
|
||||||
|
return Ok(oldstate);
|
||||||
}
|
}
|
||||||
bail!("unsupported safekeeper control file version {}", version)
|
bail!("unsupported safekeeper control file version {}", version)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
use etcd_broker::SkTimelineInfo;
|
use hyper::{Body, Request, Response, StatusCode, Uri};
|
||||||
use hyper::{Body, Request, Response, StatusCode};
|
|
||||||
|
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde::Serializer;
|
use serde::Serializer;
|
||||||
use std::collections::HashMap;
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::fmt::Display;
|
use std::fmt::Display;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
@@ -11,9 +11,11 @@ use crate::safekeeper::Term;
|
|||||||
use crate::safekeeper::TermHistory;
|
use crate::safekeeper::TermHistory;
|
||||||
use crate::timeline::{GlobalTimelines, TimelineDeleteForceResult};
|
use crate::timeline::{GlobalTimelines, TimelineDeleteForceResult};
|
||||||
use crate::SafeKeeperConf;
|
use crate::SafeKeeperConf;
|
||||||
|
use etcd_broker::subscription_value::SkTimelineInfo;
|
||||||
use utils::{
|
use utils::{
|
||||||
|
auth::JwtAuth,
|
||||||
http::{
|
http::{
|
||||||
endpoint,
|
endpoint::{self, auth_middleware, check_permission},
|
||||||
error::ApiError,
|
error::ApiError,
|
||||||
json::{json_request, json_response},
|
json::{json_request, json_response},
|
||||||
request::{ensure_no_body, parse_request_param},
|
request::{ensure_no_body, parse_request_param},
|
||||||
@@ -32,6 +34,7 @@ struct SafekeeperStatus {
|
|||||||
|
|
||||||
/// Healthcheck handler.
|
/// Healthcheck handler.
|
||||||
async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
|
check_permission(&request, None)?;
|
||||||
let conf = get_conf(&request);
|
let conf = get_conf(&request);
|
||||||
let status = SafekeeperStatus { id: conf.my_id };
|
let status = SafekeeperStatus { id: conf.my_id };
|
||||||
json_response(StatusCode::OK, status)
|
json_response(StatusCode::OK, status)
|
||||||
@@ -91,6 +94,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
|||||||
parse_request_param(&request, "tenant_id")?,
|
parse_request_param(&request, "tenant_id")?,
|
||||||
parse_request_param(&request, "timeline_id")?,
|
parse_request_param(&request, "timeline_id")?,
|
||||||
);
|
);
|
||||||
|
check_permission(&request, Some(zttid.tenant_id))?;
|
||||||
|
|
||||||
let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?;
|
let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?;
|
||||||
let (inmem, state) = tli.get_state();
|
let (inmem, state) = tli.get_state();
|
||||||
@@ -125,6 +129,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
|||||||
tenant_id: request_data.tenant_id,
|
tenant_id: request_data.tenant_id,
|
||||||
timeline_id: request_data.timeline_id,
|
timeline_id: request_data.timeline_id,
|
||||||
};
|
};
|
||||||
|
check_permission(&request, Some(zttid.tenant_id))?;
|
||||||
GlobalTimelines::create(get_conf(&request), zttid, request_data.peer_ids)
|
GlobalTimelines::create(get_conf(&request), zttid, request_data.peer_ids)
|
||||||
.map_err(ApiError::from_err)?;
|
.map_err(ApiError::from_err)?;
|
||||||
|
|
||||||
@@ -145,6 +150,7 @@ async fn timeline_delete_force_handler(
|
|||||||
parse_request_param(&request, "tenant_id")?,
|
parse_request_param(&request, "tenant_id")?,
|
||||||
parse_request_param(&request, "timeline_id")?,
|
parse_request_param(&request, "timeline_id")?,
|
||||||
);
|
);
|
||||||
|
check_permission(&request, Some(zttid.tenant_id))?;
|
||||||
ensure_no_body(&mut request).await?;
|
ensure_no_body(&mut request).await?;
|
||||||
json_response(
|
json_response(
|
||||||
StatusCode::OK,
|
StatusCode::OK,
|
||||||
@@ -160,6 +166,7 @@ async fn tenant_delete_force_handler(
|
|||||||
mut request: Request<Body>,
|
mut request: Request<Body>,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id = parse_request_param(&request, "tenant_id")?;
|
let tenant_id = parse_request_param(&request, "tenant_id")?;
|
||||||
|
check_permission(&request, Some(tenant_id))?;
|
||||||
ensure_no_body(&mut request).await?;
|
ensure_no_body(&mut request).await?;
|
||||||
json_response(
|
json_response(
|
||||||
StatusCode::OK,
|
StatusCode::OK,
|
||||||
@@ -178,6 +185,7 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
|
|||||||
parse_request_param(&request, "tenant_id")?,
|
parse_request_param(&request, "tenant_id")?,
|
||||||
parse_request_param(&request, "timeline_id")?,
|
parse_request_param(&request, "timeline_id")?,
|
||||||
);
|
);
|
||||||
|
check_permission(&request, Some(zttid.tenant_id))?;
|
||||||
let safekeeper_info: SkTimelineInfo = json_request(&mut request).await?;
|
let safekeeper_info: SkTimelineInfo = json_request(&mut request).await?;
|
||||||
|
|
||||||
let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?;
|
let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?;
|
||||||
@@ -188,15 +196,33 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Safekeeper http router.
|
/// Safekeeper http router.
|
||||||
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
|
pub fn make_router(
|
||||||
let router = endpoint::make_router();
|
conf: SafeKeeperConf,
|
||||||
|
auth: Option<Arc<JwtAuth>>,
|
||||||
|
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||||
|
let mut router = endpoint::make_router();
|
||||||
|
if auth.is_some() {
|
||||||
|
router = router.middleware(auth_middleware(|request| {
|
||||||
|
#[allow(clippy::mutable_key_type)]
|
||||||
|
static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> =
|
||||||
|
Lazy::new(|| ["/v1/status"].iter().map(|v| v.parse().unwrap()).collect());
|
||||||
|
if ALLOWLIST_ROUTES.contains(request.uri()) {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
// Option<Arc<JwtAuth>> is always provided as data below, hence unwrap().
|
||||||
|
request.data::<Option<Arc<JwtAuth>>>().unwrap().as_deref()
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
}
|
||||||
router
|
router
|
||||||
.data(Arc::new(conf))
|
.data(Arc::new(conf))
|
||||||
|
.data(auth)
|
||||||
.get("/v1/status", status_handler)
|
.get("/v1/status", status_handler)
|
||||||
.get(
|
.get(
|
||||||
"/v1/timeline/:tenant_id/:timeline_id",
|
"/v1/timeline/:tenant_id/:timeline_id",
|
||||||
timeline_status_handler,
|
timeline_status_handler,
|
||||||
)
|
)
|
||||||
|
// Will be used in the future instead of implicit timeline creation
|
||||||
.post("/v1/timeline", timeline_create_handler)
|
.post("/v1/timeline", timeline_create_handler)
|
||||||
.delete(
|
.delete(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||||
|
|||||||
@@ -124,7 +124,7 @@ fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: L
|
|||||||
term,
|
term,
|
||||||
start_streaming_at: lsn,
|
start_streaming_at: lsn,
|
||||||
term_history: history,
|
term_history: history,
|
||||||
timeline_start_lsn: Lsn(0),
|
timeline_start_lsn: lsn,
|
||||||
});
|
});
|
||||||
|
|
||||||
spg.timeline.get().process_msg(&proposer_elected_request)?;
|
spg.timeline.get().process_msg(&proposer_elected_request)?;
|
||||||
|
|||||||
@@ -57,6 +57,7 @@ pub struct SafeKeeperConf {
|
|||||||
pub my_id: NodeId,
|
pub my_id: NodeId,
|
||||||
pub broker_endpoints: Vec<Url>,
|
pub broker_endpoints: Vec<Url>,
|
||||||
pub broker_etcd_prefix: String,
|
pub broker_etcd_prefix: String,
|
||||||
|
pub auth_validation_public_key_path: Option<PathBuf>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SafeKeeperConf {
|
impl SafeKeeperConf {
|
||||||
@@ -88,6 +89,7 @@ impl Default for SafeKeeperConf {
|
|||||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||||
backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
|
backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
|
||||||
wal_backup_enabled: true,
|
wal_backup_enabled: true,
|
||||||
|
auth_validation_public_key_path: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -242,9 +242,9 @@ impl Collector for TimelineCollector {
|
|||||||
let timeline_id = tli.zttid.timeline_id.to_string();
|
let timeline_id = tli.zttid.timeline_id.to_string();
|
||||||
let labels = &[tenant_id.as_str(), timeline_id.as_str()];
|
let labels = &[tenant_id.as_str(), timeline_id.as_str()];
|
||||||
|
|
||||||
let mut most_advanced: Option<utils::pq_proto::ZenithFeedback> = None;
|
let mut most_advanced: Option<utils::pq_proto::ReplicationFeedback> = None;
|
||||||
for replica in tli.replicas.iter() {
|
for replica in tli.replicas.iter() {
|
||||||
if let Some(replica_feedback) = replica.zenith_feedback {
|
if let Some(replica_feedback) = replica.pageserver_feedback {
|
||||||
if let Some(current) = most_advanced {
|
if let Some(current) = most_advanced {
|
||||||
if current.ps_writelsn < replica_feedback.ps_writelsn {
|
if current.ps_writelsn < replica_feedback.ps_writelsn {
|
||||||
most_advanced = Some(replica_feedback);
|
most_advanced = Some(replica_feedback);
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ use anyhow::{bail, Context, Result};
|
|||||||
use byteorder::{LittleEndian, ReadBytesExt};
|
use byteorder::{LittleEndian, ReadBytesExt};
|
||||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||||
|
|
||||||
use etcd_broker::SkTimelineInfo;
|
use etcd_broker::subscription_value::SkTimelineInfo;
|
||||||
use postgres_ffi::xlog_utils::TimeLineID;
|
use postgres_ffi::xlog_utils::TimeLineID;
|
||||||
|
|
||||||
use postgres_ffi::xlog_utils::XLogSegNo;
|
use postgres_ffi::xlog_utils::XLogSegNo;
|
||||||
@@ -23,12 +23,12 @@ use postgres_ffi::xlog_utils::MAX_SEND_SIZE;
|
|||||||
use utils::{
|
use utils::{
|
||||||
bin_ser::LeSer,
|
bin_ser::LeSer,
|
||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
pq_proto::{SystemId, ZenithFeedback},
|
pq_proto::{ReplicationFeedback, SystemId},
|
||||||
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
||||||
pub const SK_FORMAT_VERSION: u32 = 5;
|
pub const SK_FORMAT_VERSION: u32 = 6;
|
||||||
const SK_PROTOCOL_VERSION: u32 = 2;
|
const SK_PROTOCOL_VERSION: u32 = 2;
|
||||||
const UNKNOWN_SERVER_VERSION: u32 = 0;
|
const UNKNOWN_SERVER_VERSION: u32 = 0;
|
||||||
|
|
||||||
@@ -348,7 +348,7 @@ pub struct AppendResponse {
|
|||||||
// a criterion for walproposer --sync mode exit
|
// a criterion for walproposer --sync mode exit
|
||||||
pub commit_lsn: Lsn,
|
pub commit_lsn: Lsn,
|
||||||
pub hs_feedback: HotStandbyFeedback,
|
pub hs_feedback: HotStandbyFeedback,
|
||||||
pub zenith_feedback: ZenithFeedback,
|
pub pageserver_feedback: ReplicationFeedback,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AppendResponse {
|
impl AppendResponse {
|
||||||
@@ -358,7 +358,7 @@ impl AppendResponse {
|
|||||||
flush_lsn: Lsn(0),
|
flush_lsn: Lsn(0),
|
||||||
commit_lsn: Lsn(0),
|
commit_lsn: Lsn(0),
|
||||||
hs_feedback: HotStandbyFeedback::empty(),
|
hs_feedback: HotStandbyFeedback::empty(),
|
||||||
zenith_feedback: ZenithFeedback::empty(),
|
pageserver_feedback: ReplicationFeedback::empty(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -476,7 +476,7 @@ impl AcceptorProposerMessage {
|
|||||||
buf.put_u64_le(msg.hs_feedback.xmin);
|
buf.put_u64_le(msg.hs_feedback.xmin);
|
||||||
buf.put_u64_le(msg.hs_feedback.catalog_xmin);
|
buf.put_u64_le(msg.hs_feedback.catalog_xmin);
|
||||||
|
|
||||||
msg.zenith_feedback.serialize(buf)?
|
msg.pageserver_feedback.serialize(buf)?
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -677,7 +677,7 @@ where
|
|||||||
commit_lsn: self.state.commit_lsn,
|
commit_lsn: self.state.commit_lsn,
|
||||||
// will be filled by the upper code to avoid bothering safekeeper
|
// will be filled by the upper code to avoid bothering safekeeper
|
||||||
hs_feedback: HotStandbyFeedback::empty(),
|
hs_feedback: HotStandbyFeedback::empty(),
|
||||||
zenith_feedback: ZenithFeedback::empty(),
|
pageserver_feedback: ReplicationFeedback::empty(),
|
||||||
};
|
};
|
||||||
trace!("formed AppendResponse {:?}", ar);
|
trace!("formed AppendResponse {:?}", ar);
|
||||||
ar
|
ar
|
||||||
|
|||||||
@@ -13,15 +13,17 @@ use serde::{Deserialize, Serialize};
|
|||||||
use std::cmp::min;
|
use std::cmp::min;
|
||||||
use std::net::Shutdown;
|
use std::net::Shutdown;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::thread::sleep;
|
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use std::{str, thread};
|
use std::{str, thread};
|
||||||
|
|
||||||
|
use tokio::sync::watch::Receiver;
|
||||||
|
use tokio::time::timeout;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::{
|
use utils::{
|
||||||
bin_ser::BeSer,
|
bin_ser::BeSer,
|
||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
postgres_backend::PostgresBackend,
|
postgres_backend::PostgresBackend,
|
||||||
pq_proto::{BeMessage, FeMessage, WalSndKeepAlive, XLogDataBody, ZenithFeedback},
|
pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody},
|
||||||
sock_split::ReadStream,
|
sock_split::ReadStream,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -29,7 +31,7 @@ use utils::{
|
|||||||
const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h';
|
const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h';
|
||||||
const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r';
|
const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r';
|
||||||
// zenith extension of replication protocol
|
// zenith extension of replication protocol
|
||||||
const ZENITH_STATUS_UPDATE_TAG_BYTE: u8 = b'z';
|
const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z';
|
||||||
|
|
||||||
type FullTransactionId = u64;
|
type FullTransactionId = u64;
|
||||||
|
|
||||||
@@ -122,15 +124,15 @@ impl ReplicationConn {
|
|||||||
warn!("unexpected StandbyReply. Read-only postgres replicas are not supported in safekeepers yet.");
|
warn!("unexpected StandbyReply. Read-only postgres replicas are not supported in safekeepers yet.");
|
||||||
// timeline.update_replica_state(replica_id, Some(state));
|
// timeline.update_replica_state(replica_id, Some(state));
|
||||||
}
|
}
|
||||||
Some(ZENITH_STATUS_UPDATE_TAG_BYTE) => {
|
Some(NEON_STATUS_UPDATE_TAG_BYTE) => {
|
||||||
// Note: deserializing is on m[9..] because we skip the tag byte and len bytes.
|
// Note: deserializing is on m[9..] because we skip the tag byte and len bytes.
|
||||||
let buf = Bytes::copy_from_slice(&m[9..]);
|
let buf = Bytes::copy_from_slice(&m[9..]);
|
||||||
let reply = ZenithFeedback::parse(buf);
|
let reply = ReplicationFeedback::parse(buf);
|
||||||
|
|
||||||
trace!("ZenithFeedback is {:?}", reply);
|
trace!("ReplicationFeedback is {:?}", reply);
|
||||||
// Only pageserver sends ZenithFeedback, so set the flag.
|
// Only pageserver sends ReplicationFeedback, so set the flag.
|
||||||
// This replica is the source of information to resend to compute.
|
// This replica is the source of information to resend to compute.
|
||||||
state.zenith_feedback = Some(reply);
|
state.pageserver_feedback = Some(reply);
|
||||||
|
|
||||||
timeline.update_replica_state(replica_id, state);
|
timeline.update_replica_state(replica_id, state);
|
||||||
}
|
}
|
||||||
@@ -191,100 +193,142 @@ impl ReplicationConn {
|
|||||||
}
|
}
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let mut wal_seg_size: usize;
|
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||||
loop {
|
.enable_all()
|
||||||
wal_seg_size = spg.timeline.get().get_state().1.server.wal_seg_size as usize;
|
.build()?;
|
||||||
if wal_seg_size == 0 {
|
|
||||||
error!("Cannot start replication before connecting to wal_proposer");
|
runtime.block_on(async move {
|
||||||
sleep(Duration::from_secs(1));
|
let (_, persisted_state) = spg.timeline.get().get_state();
|
||||||
|
// add persisted_state.timeline_start_lsn == Lsn(0) check
|
||||||
|
if persisted_state.server.wal_seg_size == 0 {
|
||||||
|
bail!("Cannot start replication before connecting to walproposer");
|
||||||
|
}
|
||||||
|
|
||||||
|
let wal_end = spg.timeline.get().get_end_of_wal();
|
||||||
|
// Walproposer gets special handling: safekeeper must give proposer all
|
||||||
|
// local WAL till the end, whether committed or not (walproposer will
|
||||||
|
// hang otherwise). That's because walproposer runs the consensus and
|
||||||
|
// synchronizes safekeepers on the most advanced one.
|
||||||
|
//
|
||||||
|
// There is a small risk of this WAL getting concurrently garbaged if
|
||||||
|
// another compute rises which collects majority and starts fixing log
|
||||||
|
// on this safekeeper itself. That's ok as (old) proposer will never be
|
||||||
|
// able to commit such WAL.
|
||||||
|
let stop_pos: Option<Lsn> = if spg.appname == Some("wal_proposer_recovery".to_string())
|
||||||
|
{
|
||||||
|
Some(wal_end)
|
||||||
} else {
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
info!("Start replication from {:?} till {:?}", start_pos, stop_pos);
|
||||||
|
|
||||||
|
// switch to copy
|
||||||
|
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||||
|
|
||||||
|
let mut end_pos = Lsn(0);
|
||||||
|
|
||||||
|
let mut wal_reader = WalReader::new(
|
||||||
|
spg.conf.timeline_dir(&spg.timeline.get().zttid),
|
||||||
|
&persisted_state,
|
||||||
|
start_pos,
|
||||||
|
spg.conf.wal_backup_enabled,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
// buffer for wal sending, limited by MAX_SEND_SIZE
|
||||||
|
let mut send_buf = vec![0u8; MAX_SEND_SIZE];
|
||||||
|
|
||||||
|
// watcher for commit_lsn updates
|
||||||
|
let mut commit_lsn_watch_rx = spg.timeline.get().get_commit_lsn_watch_rx();
|
||||||
|
|
||||||
|
loop {
|
||||||
|
if let Some(stop_pos) = stop_pos {
|
||||||
|
if start_pos >= stop_pos {
|
||||||
|
break; /* recovery finished */
|
||||||
|
}
|
||||||
|
end_pos = stop_pos;
|
||||||
|
} else {
|
||||||
|
/* Wait until we have some data to stream */
|
||||||
|
let lsn = wait_for_lsn(&mut commit_lsn_watch_rx, start_pos).await?;
|
||||||
|
|
||||||
|
if let Some(lsn) = lsn {
|
||||||
|
end_pos = lsn;
|
||||||
|
} else {
|
||||||
|
// TODO: also check once in a while whether we are walsender
|
||||||
|
// to right pageserver.
|
||||||
|
if spg.timeline.get().stop_walsender(replica_id)? {
|
||||||
|
// Shut down, timeline is suspended.
|
||||||
|
// TODO create proper error type for this
|
||||||
|
bail!("end streaming to {:?}", spg.appname);
|
||||||
|
}
|
||||||
|
|
||||||
|
// timeout expired: request pageserver status
|
||||||
|
pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
|
||||||
|
sent_ptr: end_pos.0,
|
||||||
|
timestamp: get_current_timestamp(),
|
||||||
|
request_reply: true,
|
||||||
|
}))
|
||||||
|
.context("Failed to send KeepAlive message")?;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let send_size = end_pos.checked_sub(start_pos).unwrap().0 as usize;
|
||||||
|
let send_size = min(send_size, send_buf.len());
|
||||||
|
|
||||||
|
let send_buf = &mut send_buf[..send_size];
|
||||||
|
|
||||||
|
// read wal into buffer
|
||||||
|
let send_size = wal_reader.read(send_buf).await?;
|
||||||
|
let send_buf = &send_buf[..send_size];
|
||||||
|
|
||||||
|
// Write some data to the network socket.
|
||||||
|
pgb.write_message(&BeMessage::XLogData(XLogDataBody {
|
||||||
|
wal_start: start_pos.0,
|
||||||
|
wal_end: end_pos.0,
|
||||||
|
timestamp: get_current_timestamp(),
|
||||||
|
data: send_buf,
|
||||||
|
}))
|
||||||
|
.context("Failed to send XLogData")?;
|
||||||
|
|
||||||
|
start_pos += send_size as u64;
|
||||||
|
trace!("sent WAL up to {}", start_pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
|
||||||
|
|
||||||
|
// Wait until we have commit_lsn > lsn or timeout expires. Returns latest commit_lsn.
|
||||||
|
async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> Result<Option<Lsn>> {
|
||||||
|
let commit_lsn: Lsn = *rx.borrow();
|
||||||
|
if commit_lsn > lsn {
|
||||||
|
return Ok(Some(commit_lsn));
|
||||||
|
}
|
||||||
|
|
||||||
|
let res = timeout(POLL_STATE_TIMEOUT, async move {
|
||||||
|
let mut commit_lsn;
|
||||||
|
loop {
|
||||||
|
rx.changed().await?;
|
||||||
|
commit_lsn = *rx.borrow();
|
||||||
|
if commit_lsn > lsn {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let wal_end = spg.timeline.get().get_end_of_wal();
|
|
||||||
// Walproposer gets special handling: safekeeper must give proposer all
|
|
||||||
// local WAL till the end, whether committed or not (walproposer will
|
|
||||||
// hang otherwise). That's because walproposer runs the consensus and
|
|
||||||
// synchronizes safekeepers on the most advanced one.
|
|
||||||
//
|
|
||||||
// There is a small risk of this WAL getting concurrently garbaged if
|
|
||||||
// another compute rises which collects majority and starts fixing log
|
|
||||||
// on this safekeeper itself. That's ok as (old) proposer will never be
|
|
||||||
// able to commit such WAL.
|
|
||||||
let stop_pos: Option<Lsn> = if spg.appname == Some("wal_proposer_recovery".to_string()) {
|
|
||||||
Some(wal_end)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
info!("Start replication from {:?} till {:?}", start_pos, stop_pos);
|
|
||||||
|
|
||||||
// switch to copy
|
Ok(commit_lsn)
|
||||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
})
|
||||||
|
.await;
|
||||||
|
|
||||||
let mut end_pos = Lsn(0);
|
match res {
|
||||||
|
// success
|
||||||
let mut wal_reader = WalReader::new(
|
Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)),
|
||||||
spg.conf.timeline_dir(&spg.timeline.get().zttid),
|
// error inside closure
|
||||||
wal_seg_size,
|
Ok(Err(err)) => Err(err),
|
||||||
start_pos,
|
// timeout
|
||||||
);
|
Err(_) => Ok(None),
|
||||||
|
|
||||||
// buffer for wal sending, limited by MAX_SEND_SIZE
|
|
||||||
let mut send_buf = vec![0u8; MAX_SEND_SIZE];
|
|
||||||
|
|
||||||
loop {
|
|
||||||
if let Some(stop_pos) = stop_pos {
|
|
||||||
if start_pos >= stop_pos {
|
|
||||||
break; /* recovery finished */
|
|
||||||
}
|
|
||||||
end_pos = stop_pos;
|
|
||||||
} else {
|
|
||||||
/* Wait until we have some data to stream */
|
|
||||||
let lsn = spg.timeline.get().wait_for_lsn(start_pos);
|
|
||||||
|
|
||||||
if let Some(lsn) = lsn {
|
|
||||||
end_pos = lsn;
|
|
||||||
} else {
|
|
||||||
// TODO: also check once in a while whether we are walsender
|
|
||||||
// to right pageserver.
|
|
||||||
if spg.timeline.get().stop_walsender(replica_id)? {
|
|
||||||
// Shut down, timeline is suspended.
|
|
||||||
// TODO create proper error type for this
|
|
||||||
bail!("end streaming to {:?}", spg.appname);
|
|
||||||
}
|
|
||||||
|
|
||||||
// timeout expired: request pageserver status
|
|
||||||
pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
|
|
||||||
sent_ptr: end_pos.0,
|
|
||||||
timestamp: get_current_timestamp(),
|
|
||||||
request_reply: true,
|
|
||||||
}))
|
|
||||||
.context("Failed to send KeepAlive message")?;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let send_size = end_pos.checked_sub(start_pos).unwrap().0 as usize;
|
|
||||||
let send_size = min(send_size, send_buf.len());
|
|
||||||
|
|
||||||
let send_buf = &mut send_buf[..send_size];
|
|
||||||
|
|
||||||
// read wal into buffer
|
|
||||||
let send_size = wal_reader.read(send_buf)?;
|
|
||||||
let send_buf = &send_buf[..send_size];
|
|
||||||
|
|
||||||
// Write some data to the network socket.
|
|
||||||
pgb.write_message(&BeMessage::XLogData(XLogDataBody {
|
|
||||||
wal_start: start_pos.0,
|
|
||||||
wal_end: end_pos.0,
|
|
||||||
timestamp: get_current_timestamp(),
|
|
||||||
data: send_buf,
|
|
||||||
}))
|
|
||||||
.context("Failed to send XLogData")?;
|
|
||||||
|
|
||||||
start_pos += send_size as u64;
|
|
||||||
trace!("sent WAL up to {}", start_pos);
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
use anyhow::{bail, Context, Result};
|
use anyhow::{bail, Context, Result};
|
||||||
|
|
||||||
use etcd_broker::SkTimelineInfo;
|
use etcd_broker::subscription_value::SkTimelineInfo;
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use postgres_ffi::xlog_utils::XLogSegNo;
|
use postgres_ffi::xlog_utils::XLogSegNo;
|
||||||
|
|
||||||
@@ -11,17 +11,17 @@ use serde::Serialize;
|
|||||||
use tokio::sync::watch;
|
use tokio::sync::watch;
|
||||||
|
|
||||||
use std::cmp::{max, min};
|
use std::cmp::{max, min};
|
||||||
use std::collections::HashMap;
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::fs::{self};
|
use std::fs::{self};
|
||||||
|
|
||||||
use std::sync::{Arc, Condvar, Mutex, MutexGuard};
|
use std::sync::{Arc, Mutex, MutexGuard};
|
||||||
use std::time::Duration;
|
|
||||||
use tokio::sync::mpsc::Sender;
|
use tokio::sync::mpsc::Sender;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
|
||||||
use utils::{
|
use utils::{
|
||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
pq_proto::ZenithFeedback,
|
pq_proto::ReplicationFeedback,
|
||||||
zid::{NodeId, ZTenantId, ZTenantTimelineId},
|
zid::{NodeId, ZTenantId, ZTenantTimelineId},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -37,8 +37,6 @@ use crate::wal_storage;
|
|||||||
use crate::wal_storage::Storage as wal_storage_iface;
|
use crate::wal_storage::Storage as wal_storage_iface;
|
||||||
use crate::SafeKeeperConf;
|
use crate::SafeKeeperConf;
|
||||||
|
|
||||||
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
|
|
||||||
|
|
||||||
/// Replica status update + hot standby feedback
|
/// Replica status update + hot standby feedback
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct ReplicaState {
|
pub struct ReplicaState {
|
||||||
@@ -48,8 +46,8 @@ pub struct ReplicaState {
|
|||||||
pub remote_consistent_lsn: Lsn,
|
pub remote_consistent_lsn: Lsn,
|
||||||
/// combined hot standby feedback from all replicas
|
/// combined hot standby feedback from all replicas
|
||||||
pub hs_feedback: HotStandbyFeedback,
|
pub hs_feedback: HotStandbyFeedback,
|
||||||
/// Zenith specific feedback received from pageserver, if any
|
/// Replication specific feedback received from pageserver, if any
|
||||||
pub zenith_feedback: Option<ZenithFeedback>,
|
pub pageserver_feedback: Option<ReplicationFeedback>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for ReplicaState {
|
impl Default for ReplicaState {
|
||||||
@@ -68,7 +66,7 @@ impl ReplicaState {
|
|||||||
xmin: u64::MAX,
|
xmin: u64::MAX,
|
||||||
catalog_xmin: u64::MAX,
|
catalog_xmin: u64::MAX,
|
||||||
},
|
},
|
||||||
zenith_feedback: None,
|
pageserver_feedback: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -77,9 +75,6 @@ impl ReplicaState {
|
|||||||
struct SharedState {
|
struct SharedState {
|
||||||
/// Safekeeper object
|
/// Safekeeper object
|
||||||
sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
|
sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
|
||||||
/// For receiving-sending wal cooperation
|
|
||||||
/// quorum commit LSN we've notified walsenders about
|
|
||||||
notified_commit_lsn: Lsn,
|
|
||||||
/// State of replicas
|
/// State of replicas
|
||||||
replicas: Vec<Option<ReplicaState>>,
|
replicas: Vec<Option<ReplicaState>>,
|
||||||
/// True when WAL backup launcher oversees the timeline, making sure WAL is
|
/// True when WAL backup launcher oversees the timeline, making sure WAL is
|
||||||
@@ -112,7 +107,6 @@ impl SharedState {
|
|||||||
let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?;
|
let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?;
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
notified_commit_lsn: Lsn(0),
|
|
||||||
sk,
|
sk,
|
||||||
replicas: Vec::new(),
|
replicas: Vec::new(),
|
||||||
wal_backup_active: false,
|
wal_backup_active: false,
|
||||||
@@ -131,7 +125,6 @@ impl SharedState {
|
|||||||
info!("timeline {} restored", zttid.timeline_id);
|
info!("timeline {} restored", zttid.timeline_id);
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
notified_commit_lsn: Lsn(0),
|
|
||||||
sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?,
|
sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?,
|
||||||
replicas: Vec::new(),
|
replicas: Vec::new(),
|
||||||
wal_backup_active: false,
|
wal_backup_active: false,
|
||||||
@@ -149,8 +142,12 @@ impl SharedState {
|
|||||||
|
|
||||||
/// Mark timeline active/inactive and return whether s3 offloading requires
|
/// Mark timeline active/inactive and return whether s3 offloading requires
|
||||||
/// start/stop action.
|
/// start/stop action.
|
||||||
fn update_status(&mut self) -> bool {
|
fn update_status(&mut self, ttid: ZTenantTimelineId) -> bool {
|
||||||
self.active = self.is_active();
|
let is_active = self.is_active();
|
||||||
|
if self.active != is_active {
|
||||||
|
info!("timeline {} active={} now", ttid, is_active);
|
||||||
|
}
|
||||||
|
self.active = is_active;
|
||||||
self.is_wal_backup_action_pending()
|
self.is_wal_backup_action_pending()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -187,6 +184,12 @@ impl SharedState {
|
|||||||
self.wal_backup_active
|
self.wal_backup_active
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Can this safekeeper offload to s3? Recently joined safekeepers might not
|
||||||
|
// have necessary WAL.
|
||||||
|
fn can_wal_backup(&self) -> bool {
|
||||||
|
self.sk.state.local_start_lsn <= self.sk.inmem.backup_lsn
|
||||||
|
}
|
||||||
|
|
||||||
fn get_wal_seg_size(&self) -> usize {
|
fn get_wal_seg_size(&self) -> usize {
|
||||||
self.sk.state.server.wal_seg_size as usize
|
self.sk.state.server.wal_seg_size as usize
|
||||||
}
|
}
|
||||||
@@ -211,25 +214,25 @@ impl SharedState {
|
|||||||
// we need to know which pageserver compute node considers to be main.
|
// we need to know which pageserver compute node considers to be main.
|
||||||
// See https://github.com/zenithdb/zenith/issues/1171
|
// See https://github.com/zenithdb/zenith/issues/1171
|
||||||
//
|
//
|
||||||
if let Some(zenith_feedback) = state.zenith_feedback {
|
if let Some(pageserver_feedback) = state.pageserver_feedback {
|
||||||
if let Some(acc_feedback) = acc.zenith_feedback {
|
if let Some(acc_feedback) = acc.pageserver_feedback {
|
||||||
if acc_feedback.ps_writelsn < zenith_feedback.ps_writelsn {
|
if acc_feedback.ps_writelsn < pageserver_feedback.ps_writelsn {
|
||||||
warn!("More than one pageserver is streaming WAL for the timeline. Feedback resolving is not fully supported yet.");
|
warn!("More than one pageserver is streaming WAL for the timeline. Feedback resolving is not fully supported yet.");
|
||||||
acc.zenith_feedback = Some(zenith_feedback);
|
acc.pageserver_feedback = Some(pageserver_feedback);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
acc.zenith_feedback = Some(zenith_feedback);
|
acc.pageserver_feedback = Some(pageserver_feedback);
|
||||||
}
|
}
|
||||||
|
|
||||||
// last lsn received by pageserver
|
// last lsn received by pageserver
|
||||||
// FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver.
|
// FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver.
|
||||||
// See https://github.com/zenithdb/zenith/issues/1171
|
// See https://github.com/zenithdb/zenith/issues/1171
|
||||||
acc.last_received_lsn = Lsn::from(zenith_feedback.ps_writelsn);
|
acc.last_received_lsn = Lsn::from(pageserver_feedback.ps_writelsn);
|
||||||
|
|
||||||
// When at least one pageserver has preserved data up to remote_consistent_lsn,
|
// When at least one pageserver has preserved data up to remote_consistent_lsn,
|
||||||
// safekeeper is free to delete it, so choose max of all pageservers.
|
// safekeeper is free to delete it, so choose max of all pageservers.
|
||||||
acc.remote_consistent_lsn = max(
|
acc.remote_consistent_lsn = max(
|
||||||
Lsn::from(zenith_feedback.ps_applylsn),
|
Lsn::from(pageserver_feedback.ps_applylsn),
|
||||||
acc.remote_consistent_lsn,
|
acc.remote_consistent_lsn,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -261,8 +264,6 @@ pub struct Timeline {
|
|||||||
/// For breeding receivers.
|
/// For breeding receivers.
|
||||||
commit_lsn_watch_rx: watch::Receiver<Lsn>,
|
commit_lsn_watch_rx: watch::Receiver<Lsn>,
|
||||||
mutex: Mutex<SharedState>,
|
mutex: Mutex<SharedState>,
|
||||||
/// conditional variable used to notify wal senders
|
|
||||||
cond: Condvar,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Timeline {
|
impl Timeline {
|
||||||
@@ -279,7 +280,6 @@ impl Timeline {
|
|||||||
commit_lsn_watch_tx,
|
commit_lsn_watch_tx,
|
||||||
commit_lsn_watch_rx,
|
commit_lsn_watch_rx,
|
||||||
mutex: Mutex::new(shared_state),
|
mutex: Mutex::new(shared_state),
|
||||||
cond: Condvar::new(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -291,7 +291,7 @@ impl Timeline {
|
|||||||
{
|
{
|
||||||
let mut shared_state = self.mutex.lock().unwrap();
|
let mut shared_state = self.mutex.lock().unwrap();
|
||||||
shared_state.num_computes += 1;
|
shared_state.num_computes += 1;
|
||||||
is_wal_backup_action_pending = shared_state.update_status();
|
is_wal_backup_action_pending = shared_state.update_status(self.zttid);
|
||||||
}
|
}
|
||||||
// Wake up wal backup launcher, if offloading not started yet.
|
// Wake up wal backup launcher, if offloading not started yet.
|
||||||
if is_wal_backup_action_pending {
|
if is_wal_backup_action_pending {
|
||||||
@@ -308,7 +308,7 @@ impl Timeline {
|
|||||||
{
|
{
|
||||||
let mut shared_state = self.mutex.lock().unwrap();
|
let mut shared_state = self.mutex.lock().unwrap();
|
||||||
shared_state.num_computes -= 1;
|
shared_state.num_computes -= 1;
|
||||||
is_wal_backup_action_pending = shared_state.update_status();
|
is_wal_backup_action_pending = shared_state.update_status(self.zttid);
|
||||||
}
|
}
|
||||||
// Wake up wal backup launcher, if it is time to stop the offloading.
|
// Wake up wal backup launcher, if it is time to stop the offloading.
|
||||||
if is_wal_backup_action_pending {
|
if is_wal_backup_action_pending {
|
||||||
@@ -323,11 +323,11 @@ impl Timeline {
|
|||||||
let mut shared_state = self.mutex.lock().unwrap();
|
let mut shared_state = self.mutex.lock().unwrap();
|
||||||
if shared_state.num_computes == 0 {
|
if shared_state.num_computes == 0 {
|
||||||
let replica_state = shared_state.replicas[replica_id].unwrap();
|
let replica_state = shared_state.replicas[replica_id].unwrap();
|
||||||
let stop = shared_state.notified_commit_lsn == Lsn(0) || // no data at all yet
|
let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
|
||||||
(replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet.
|
(replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet.
|
||||||
replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn);
|
replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn);
|
||||||
if stop {
|
if stop {
|
||||||
shared_state.update_status();
|
shared_state.update_status(self.zttid);
|
||||||
return Ok(true);
|
return Ok(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -341,6 +341,12 @@ impl Timeline {
|
|||||||
shared_state.wal_backup_attend()
|
shared_state.wal_backup_attend()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Can this safekeeper offload to s3? Recently joined safekeepers might not
|
||||||
|
// have necessary WAL.
|
||||||
|
pub fn can_wal_backup(&self) -> bool {
|
||||||
|
self.mutex.lock().unwrap().can_wal_backup()
|
||||||
|
}
|
||||||
|
|
||||||
/// Deactivates the timeline, assuming it is being deleted.
|
/// Deactivates the timeline, assuming it is being deleted.
|
||||||
/// Returns whether the timeline was already active.
|
/// Returns whether the timeline was already active.
|
||||||
///
|
///
|
||||||
@@ -389,39 +395,6 @@ impl Timeline {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Timed wait for an LSN to be committed.
|
|
||||||
///
|
|
||||||
/// Returns the last committed LSN, which will be at least
|
|
||||||
/// as high as the LSN waited for, or None if timeout expired.
|
|
||||||
///
|
|
||||||
pub fn wait_for_lsn(&self, lsn: Lsn) -> Option<Lsn> {
|
|
||||||
let mut shared_state = self.mutex.lock().unwrap();
|
|
||||||
loop {
|
|
||||||
let commit_lsn = shared_state.notified_commit_lsn;
|
|
||||||
// This must be `>`, not `>=`.
|
|
||||||
if commit_lsn > lsn {
|
|
||||||
return Some(commit_lsn);
|
|
||||||
}
|
|
||||||
let result = self
|
|
||||||
.cond
|
|
||||||
.wait_timeout(shared_state, POLL_STATE_TIMEOUT)
|
|
||||||
.unwrap();
|
|
||||||
if result.1.timed_out() {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
shared_state = result.0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Notify caught-up WAL senders about new WAL data received
|
|
||||||
// TODO: replace-unify it with commit_lsn_watch.
|
|
||||||
fn notify_wal_senders(&self, shared_state: &mut MutexGuard<SharedState>) {
|
|
||||||
if shared_state.notified_commit_lsn < shared_state.sk.inmem.commit_lsn {
|
|
||||||
shared_state.notified_commit_lsn = shared_state.sk.inmem.commit_lsn;
|
|
||||||
self.cond.notify_all();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
|
pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
|
||||||
self.commit_lsn_watch_rx.clone()
|
self.commit_lsn_watch_rx.clone()
|
||||||
}
|
}
|
||||||
@@ -441,13 +414,11 @@ impl Timeline {
|
|||||||
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
|
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
|
||||||
let state = shared_state.get_replicas_state();
|
let state = shared_state.get_replicas_state();
|
||||||
resp.hs_feedback = state.hs_feedback;
|
resp.hs_feedback = state.hs_feedback;
|
||||||
if let Some(zenith_feedback) = state.zenith_feedback {
|
if let Some(pageserver_feedback) = state.pageserver_feedback {
|
||||||
resp.zenith_feedback = zenith_feedback;
|
resp.pageserver_feedback = pageserver_feedback;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ping wal sender that new data might be available.
|
|
||||||
self.notify_wal_senders(&mut shared_state);
|
|
||||||
commit_lsn = shared_state.sk.inmem.commit_lsn;
|
commit_lsn = shared_state.sk.inmem.commit_lsn;
|
||||||
}
|
}
|
||||||
self.commit_lsn_watch_tx.send(commit_lsn)?;
|
self.commit_lsn_watch_tx.send(commit_lsn)?;
|
||||||
@@ -474,9 +445,9 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Prepare public safekeeper info for reporting.
|
/// Prepare public safekeeper info for reporting.
|
||||||
pub fn get_public_info(&self, conf: &SafeKeeperConf) -> anyhow::Result<SkTimelineInfo> {
|
pub fn get_public_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo {
|
||||||
let shared_state = self.mutex.lock().unwrap();
|
let shared_state = self.mutex.lock().unwrap();
|
||||||
Ok(SkTimelineInfo {
|
SkTimelineInfo {
|
||||||
last_log_term: Some(shared_state.sk.get_epoch()),
|
last_log_term: Some(shared_state.sk.get_epoch()),
|
||||||
flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()),
|
flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()),
|
||||||
// note: this value is not flushed to control file yet and can be lost
|
// note: this value is not flushed to control file yet and can be lost
|
||||||
@@ -489,7 +460,7 @@ impl Timeline {
|
|||||||
peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn),
|
peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn),
|
||||||
safekeeper_connstr: Some(conf.listen_pg_addr.clone()),
|
safekeeper_connstr: Some(conf.listen_pg_addr.clone()),
|
||||||
backup_lsn: Some(shared_state.sk.inmem.backup_lsn),
|
backup_lsn: Some(shared_state.sk.inmem.backup_lsn),
|
||||||
})
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Update timeline state with peer safekeeper data.
|
/// Update timeline state with peer safekeeper data.
|
||||||
@@ -508,8 +479,7 @@ impl Timeline {
|
|||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
shared_state.sk.record_safekeeper_info(sk_info)?;
|
shared_state.sk.record_safekeeper_info(sk_info)?;
|
||||||
self.notify_wal_senders(&mut shared_state);
|
is_wal_backup_action_pending = shared_state.update_status(self.zttid);
|
||||||
is_wal_backup_action_pending = shared_state.update_status();
|
|
||||||
commit_lsn = shared_state.sk.inmem.commit_lsn;
|
commit_lsn = shared_state.sk.inmem.commit_lsn;
|
||||||
}
|
}
|
||||||
self.commit_lsn_watch_tx.send(commit_lsn)?;
|
self.commit_lsn_watch_tx.send(commit_lsn)?;
|
||||||
@@ -655,6 +625,8 @@ impl GlobalTimelines {
|
|||||||
zttid: ZTenantTimelineId,
|
zttid: ZTenantTimelineId,
|
||||||
create: bool,
|
create: bool,
|
||||||
) -> Result<Arc<Timeline>> {
|
) -> Result<Arc<Timeline>> {
|
||||||
|
let _enter = info_span!("", timeline = %zttid.tenant_id).entered();
|
||||||
|
|
||||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||||
|
|
||||||
match state.timelines.get(&zttid) {
|
match state.timelines.get(&zttid) {
|
||||||
@@ -697,7 +669,7 @@ impl GlobalTimelines {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Get ZTenantTimelineIDs of all active timelines.
|
/// Get ZTenantTimelineIDs of all active timelines.
|
||||||
pub fn get_active_timelines() -> Vec<ZTenantTimelineId> {
|
pub fn get_active_timelines() -> HashSet<ZTenantTimelineId> {
|
||||||
let state = TIMELINES_STATE.lock().unwrap();
|
let state = TIMELINES_STATE.lock().unwrap();
|
||||||
state
|
state
|
||||||
.timelines
|
.timelines
|
||||||
|
|||||||
@@ -1,4 +1,8 @@
|
|||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
|
use etcd_broker::subscription_key::{
|
||||||
|
NodeKind, OperationKind, SkOperationKind, SubscriptionKey, SubscriptionKind,
|
||||||
|
};
|
||||||
|
use tokio::io::AsyncRead;
|
||||||
use tokio::task::JoinHandle;
|
use tokio::task::JoinHandle;
|
||||||
|
|
||||||
use std::cmp::min;
|
use std::cmp::min;
|
||||||
@@ -7,7 +11,9 @@ use std::path::{Path, PathBuf};
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use postgres_ffi::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, PG_TLI};
|
use postgres_ffi::xlog_utils::{
|
||||||
|
XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, MAX_SEND_SIZE, PG_TLI,
|
||||||
|
};
|
||||||
use remote_storage::{GenericRemoteStorage, RemoteStorage};
|
use remote_storage::{GenericRemoteStorage, RemoteStorage};
|
||||||
use tokio::fs::File;
|
use tokio::fs::File;
|
||||||
use tokio::runtime::Builder;
|
use tokio::runtime::Builder;
|
||||||
@@ -26,8 +32,6 @@ use crate::{broker, SafeKeeperConf};
|
|||||||
|
|
||||||
use once_cell::sync::OnceCell;
|
use once_cell::sync::OnceCell;
|
||||||
|
|
||||||
const BACKUP_ELECTION_NAME: &str = "WAL_BACKUP";
|
|
||||||
|
|
||||||
const BROKER_CONNECTION_RETRY_DELAY_MS: u64 = 1000;
|
const BROKER_CONNECTION_RETRY_DELAY_MS: u64 = 1000;
|
||||||
|
|
||||||
const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
|
const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
|
||||||
@@ -48,14 +52,10 @@ pub fn wal_backup_launcher_thread_main(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check whether wal backup is required for timeline and mark that launcher is
|
/// Check whether wal backup is required for timeline. If yes, mark that launcher is
|
||||||
/// aware of current status (if timeline exists).
|
/// aware of current status and return the timeline.
|
||||||
fn is_wal_backup_required(zttid: ZTenantTimelineId) -> bool {
|
fn is_wal_backup_required(zttid: ZTenantTimelineId) -> Option<Arc<Timeline>> {
|
||||||
if let Some(tli) = GlobalTimelines::get_loaded(zttid) {
|
GlobalTimelines::get_loaded(zttid).filter(|t| t.wal_backup_attend())
|
||||||
tli.wal_backup_attend()
|
|
||||||
} else {
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
struct WalBackupTaskHandle {
|
struct WalBackupTaskHandle {
|
||||||
@@ -63,6 +63,56 @@ struct WalBackupTaskHandle {
|
|||||||
handle: JoinHandle<()>,
|
handle: JoinHandle<()>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct WalBackupTimelineEntry {
|
||||||
|
timeline: Arc<Timeline>,
|
||||||
|
handle: Option<WalBackupTaskHandle>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Start per timeline task, if it makes sense for this safekeeper to offload.
|
||||||
|
fn consider_start_task(
|
||||||
|
conf: &SafeKeeperConf,
|
||||||
|
zttid: ZTenantTimelineId,
|
||||||
|
task: &mut WalBackupTimelineEntry,
|
||||||
|
) {
|
||||||
|
if !task.timeline.can_wal_backup() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
info!("starting WAL backup task for {}", zttid);
|
||||||
|
|
||||||
|
// TODO: decide who should offload right here by simply checking current
|
||||||
|
// state instead of running elections in offloading task.
|
||||||
|
let election_name = SubscriptionKey {
|
||||||
|
cluster_prefix: conf.broker_etcd_prefix.clone(),
|
||||||
|
kind: SubscriptionKind::Operation(
|
||||||
|
zttid,
|
||||||
|
NodeKind::Safekeeper,
|
||||||
|
OperationKind::Safekeeper(SkOperationKind::WalBackup),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
.watch_key();
|
||||||
|
let my_candidate_name = broker::get_candiate_name(conf.my_id);
|
||||||
|
let election = broker::Election::new(
|
||||||
|
election_name,
|
||||||
|
my_candidate_name,
|
||||||
|
conf.broker_endpoints.clone(),
|
||||||
|
);
|
||||||
|
|
||||||
|
let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
|
||||||
|
let timeline_dir = conf.timeline_dir(&zttid);
|
||||||
|
|
||||||
|
let handle = tokio::spawn(
|
||||||
|
backup_task_main(zttid, timeline_dir, shutdown_rx, election)
|
||||||
|
.instrument(info_span!("WAL backup task", zttid = %zttid)),
|
||||||
|
);
|
||||||
|
|
||||||
|
task.handle = Some(WalBackupTaskHandle {
|
||||||
|
shutdown_tx,
|
||||||
|
handle,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
|
||||||
|
|
||||||
/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
|
/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
|
||||||
/// tasks. Having this in separate task simplifies locking, allows to reap
|
/// tasks. Having this in separate task simplifies locking, allows to reap
|
||||||
/// panics and separate elections from offloading itself.
|
/// panics and separate elections from offloading itself.
|
||||||
@@ -71,7 +121,7 @@ async fn wal_backup_launcher_main_loop(
|
|||||||
mut wal_backup_launcher_rx: Receiver<ZTenantTimelineId>,
|
mut wal_backup_launcher_rx: Receiver<ZTenantTimelineId>,
|
||||||
) {
|
) {
|
||||||
info!(
|
info!(
|
||||||
"WAL backup launcher: started, remote config {:?}",
|
"WAL backup launcher started, remote config {:?}",
|
||||||
conf.remote_storage
|
conf.remote_storage
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -82,60 +132,50 @@ async fn wal_backup_launcher_main_loop(
|
|||||||
})
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
let mut tasks: HashMap<ZTenantTimelineId, WalBackupTaskHandle> = HashMap::new();
|
// Presence in this map means launcher is aware s3 offloading is needed for
|
||||||
|
// the timeline, but task is started only if it makes sense to offload
|
||||||
|
// from this safekeeper.
|
||||||
|
let mut tasks: HashMap<ZTenantTimelineId, WalBackupTimelineEntry> = HashMap::new();
|
||||||
|
|
||||||
|
let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC));
|
||||||
loop {
|
loop {
|
||||||
// channel is never expected to get closed
|
tokio::select! {
|
||||||
let zttid = wal_backup_launcher_rx.recv().await.unwrap();
|
zttid = wal_backup_launcher_rx.recv() => {
|
||||||
let is_wal_backup_required = is_wal_backup_required(zttid);
|
// channel is never expected to get closed
|
||||||
if conf.remote_storage.is_none() || !conf.wal_backup_enabled {
|
let zttid = zttid.unwrap();
|
||||||
continue; /* just drain the channel and do nothing */
|
if conf.remote_storage.is_none() || !conf.wal_backup_enabled {
|
||||||
}
|
continue; /* just drain the channel and do nothing */
|
||||||
// do we need to do anything at all?
|
}
|
||||||
if is_wal_backup_required != tasks.contains_key(&zttid) {
|
let timeline = is_wal_backup_required(zttid);
|
||||||
if is_wal_backup_required {
|
// do we need to do anything at all?
|
||||||
// need to start the task
|
if timeline.is_some() != tasks.contains_key(&zttid) {
|
||||||
info!("starting WAL backup task for {}", zttid);
|
if let Some(timeline) = timeline {
|
||||||
|
// need to start the task
|
||||||
|
let entry = tasks.entry(zttid).or_insert(WalBackupTimelineEntry {
|
||||||
|
timeline,
|
||||||
|
handle: None,
|
||||||
|
});
|
||||||
|
consider_start_task(&conf, zttid, entry);
|
||||||
|
} else {
|
||||||
|
// need to stop the task
|
||||||
|
info!("stopping WAL backup task for {}", zttid);
|
||||||
|
|
||||||
// TODO: decide who should offload in launcher itself by simply checking current state
|
let entry = tasks.remove(&zttid).unwrap();
|
||||||
let election_name = broker::get_campaign_name(
|
if let Some(wb_handle) = entry.handle {
|
||||||
BACKUP_ELECTION_NAME,
|
// Tell the task to shutdown. Error means task exited earlier, that's ok.
|
||||||
&conf.broker_etcd_prefix,
|
let _ = wb_handle.shutdown_tx.send(()).await;
|
||||||
zttid,
|
// Await the task itself. TODO: restart panicked tasks earlier.
|
||||||
);
|
if let Err(e) = wb_handle.handle.await {
|
||||||
let my_candidate_name = broker::get_candiate_name(conf.my_id);
|
warn!("WAL backup task for {} panicked: {}", zttid, e);
|
||||||
let election = broker::Election::new(
|
}
|
||||||
election_name,
|
}
|
||||||
my_candidate_name,
|
}
|
||||||
conf.broker_endpoints.clone(),
|
}
|
||||||
);
|
}
|
||||||
|
// Start known tasks, if needed and possible.
|
||||||
let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
|
_ = ticker.tick() => {
|
||||||
let timeline_dir = conf.timeline_dir(&zttid);
|
for (zttid, entry) in tasks.iter_mut().filter(|(_, entry)| entry.handle.is_none()) {
|
||||||
|
consider_start_task(&conf, *zttid, entry);
|
||||||
let handle = tokio::spawn(
|
|
||||||
backup_task_main(zttid, timeline_dir, shutdown_rx, election)
|
|
||||||
.instrument(info_span!("WAL backup task", zttid = %zttid)),
|
|
||||||
);
|
|
||||||
|
|
||||||
tasks.insert(
|
|
||||||
zttid,
|
|
||||||
WalBackupTaskHandle {
|
|
||||||
shutdown_tx,
|
|
||||||
handle,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
// need to stop the task
|
|
||||||
info!("stopping WAL backup task for {}", zttid);
|
|
||||||
|
|
||||||
let wb_handle = tasks.remove(&zttid).unwrap();
|
|
||||||
// Tell the task to shutdown. Error means task exited earlier, that's ok.
|
|
||||||
let _ = wb_handle.shutdown_tx.send(()).await;
|
|
||||||
// Await the task itself. TODO: restart panicked tasks earlier.
|
|
||||||
// Hm, why I can't await on reference to handle?
|
|
||||||
if let Err(e) = wb_handle.handle.await {
|
|
||||||
warn!("WAL backup task for {} panicked: {}", zttid, e);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -200,20 +240,11 @@ impl WalBackupTask {
|
|||||||
loop {
|
loop {
|
||||||
let mut retry_attempt = 0u32;
|
let mut retry_attempt = 0u32;
|
||||||
|
|
||||||
if let Some(l) = self.leader.take() {
|
|
||||||
l.give_up().await;
|
|
||||||
}
|
|
||||||
|
|
||||||
info!("acquiring leadership");
|
info!("acquiring leadership");
|
||||||
match broker::get_leader(&self.election).await {
|
if let Err(e) = broker::get_leader(&self.election, &mut self.leader).await {
|
||||||
Ok(l) => {
|
error!("error during leader election {:?}", e);
|
||||||
self.leader = Some(l);
|
sleep(Duration::from_millis(BROKER_CONNECTION_RETRY_DELAY_MS)).await;
|
||||||
}
|
continue;
|
||||||
Err(e) => {
|
|
||||||
error!("error during leader election {:?}", e);
|
|
||||||
sleep(Duration::from_millis(BROKER_CONNECTION_RETRY_DELAY_MS)).await;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
info!("acquired leadership");
|
info!("acquired leadership");
|
||||||
|
|
||||||
@@ -417,3 +448,49 @@ async fn backup_object(source_file: &Path, size: usize) -> Result<()> {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn read_object(
|
||||||
|
file_path: PathBuf,
|
||||||
|
offset: u64,
|
||||||
|
) -> (impl AsyncRead, JoinHandle<Result<()>>) {
|
||||||
|
let storage = REMOTE_STORAGE.get().expect("failed to get remote storage");
|
||||||
|
|
||||||
|
let (mut pipe_writer, pipe_reader) = tokio::io::duplex(MAX_SEND_SIZE);
|
||||||
|
|
||||||
|
let copy_result = tokio::spawn(async move {
|
||||||
|
let res = match storage.as_ref().unwrap() {
|
||||||
|
GenericRemoteStorage::Local(local_storage) => {
|
||||||
|
let source = local_storage.remote_object_id(&file_path)?;
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"local download about to start from {} at offset {}",
|
||||||
|
source.display(),
|
||||||
|
offset
|
||||||
|
);
|
||||||
|
local_storage
|
||||||
|
.download_byte_range(&source, offset, None, &mut pipe_writer)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
GenericRemoteStorage::S3(s3_storage) => {
|
||||||
|
let s3key = s3_storage.remote_object_id(&file_path)?;
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"S3 download about to start from {:?} at offset {}",
|
||||||
|
s3key, offset
|
||||||
|
);
|
||||||
|
s3_storage
|
||||||
|
.download_byte_range(&s3key, offset, None, &mut pipe_writer)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Err(e) = res {
|
||||||
|
error!("failed to download WAL segment from remote storage: {}", e);
|
||||||
|
Err(e)
|
||||||
|
} else {
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
(pipe_reader, copy_result)
|
||||||
|
}
|
||||||
|
|||||||
@@ -8,7 +8,9 @@
|
|||||||
//! Note that last file has `.partial` suffix, that's different from postgres.
|
//! Note that last file has `.partial` suffix, that's different from postgres.
|
||||||
|
|
||||||
use anyhow::{anyhow, bail, Context, Result};
|
use anyhow::{anyhow, bail, Context, Result};
|
||||||
use std::io::{Read, Seek, SeekFrom};
|
use std::io::{self, Seek, SeekFrom};
|
||||||
|
use std::pin::Pin;
|
||||||
|
use tokio::io::AsyncRead;
|
||||||
|
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use postgres_ffi::xlog_utils::{
|
use postgres_ffi::xlog_utils::{
|
||||||
@@ -26,6 +28,7 @@ use utils::{lsn::Lsn, zid::ZTenantTimelineId};
|
|||||||
|
|
||||||
use crate::safekeeper::SafeKeeperState;
|
use crate::safekeeper::SafeKeeperState;
|
||||||
|
|
||||||
|
use crate::wal_backup::read_object;
|
||||||
use crate::SafeKeeperConf;
|
use crate::SafeKeeperConf;
|
||||||
use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ};
|
use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ};
|
||||||
|
|
||||||
@@ -33,6 +36,8 @@ use postgres_ffi::waldecoder::WalStreamDecoder;
|
|||||||
|
|
||||||
use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS};
|
use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS};
|
||||||
|
|
||||||
|
use tokio::io::{AsyncReadExt, AsyncSeekExt};
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`).
|
// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`).
|
||||||
// i64 is faster than f64, so update to u64 when available.
|
// i64 is faster than f64, so update to u64 when available.
|
||||||
@@ -504,69 +509,123 @@ pub struct WalReader {
|
|||||||
timeline_dir: PathBuf,
|
timeline_dir: PathBuf,
|
||||||
wal_seg_size: usize,
|
wal_seg_size: usize,
|
||||||
pos: Lsn,
|
pos: Lsn,
|
||||||
file: Option<File>,
|
wal_segment: Option<Pin<Box<dyn AsyncRead>>>,
|
||||||
|
|
||||||
|
enable_remote_read: bool,
|
||||||
|
// S3 will be used to read WAL if LSN is not available locally
|
||||||
|
local_start_lsn: Lsn,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl WalReader {
|
impl WalReader {
|
||||||
pub fn new(timeline_dir: PathBuf, wal_seg_size: usize, pos: Lsn) -> Self {
|
pub fn new(
|
||||||
Self {
|
timeline_dir: PathBuf,
|
||||||
timeline_dir,
|
state: &SafeKeeperState,
|
||||||
wal_seg_size,
|
start_pos: Lsn,
|
||||||
pos,
|
enable_remote_read: bool,
|
||||||
file: None,
|
) -> Result<Self> {
|
||||||
|
if start_pos < state.timeline_start_lsn {
|
||||||
|
bail!(
|
||||||
|
"Requested streaming from {}, which is before the start of the timeline {}",
|
||||||
|
start_pos,
|
||||||
|
state.timeline_start_lsn
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: add state.timeline_start_lsn == Lsn(0) check
|
||||||
|
if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) {
|
||||||
|
bail!("state uninitialized, no data to read");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
timeline_dir,
|
||||||
|
wal_seg_size: state.server.wal_seg_size as usize,
|
||||||
|
pos: start_pos,
|
||||||
|
wal_segment: None,
|
||||||
|
enable_remote_read,
|
||||||
|
local_start_lsn: state.local_start_lsn,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
|
pub async fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
|
||||||
// Take the `File` from `wal_file`, or open a new file.
|
let mut wal_segment = match self.wal_segment.take() {
|
||||||
let mut file = match self.file.take() {
|
Some(reader) => reader,
|
||||||
Some(file) => file,
|
None => self.open_segment().await?,
|
||||||
None => {
|
|
||||||
// Open a new file.
|
|
||||||
let segno = self.pos.segment_number(self.wal_seg_size);
|
|
||||||
let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size);
|
|
||||||
let wal_file_path = self.timeline_dir.join(wal_file_name);
|
|
||||||
Self::open_wal_file(&wal_file_path)?
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize;
|
|
||||||
|
|
||||||
// How much to read and send in message? We cannot cross the WAL file
|
// How much to read and send in message? We cannot cross the WAL file
|
||||||
// boundary, and we don't want to send more than the provided buffer.
|
// boundary, and we don't want to send more than the provided buffer.
|
||||||
|
let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize;
|
||||||
let send_size = min(buf.len(), self.wal_seg_size - xlogoff);
|
let send_size = min(buf.len(), self.wal_seg_size - xlogoff);
|
||||||
|
|
||||||
// Read some data from the file.
|
// Read some data from the file.
|
||||||
let buf = &mut buf[0..send_size];
|
let buf = &mut buf[0..send_size];
|
||||||
file.seek(SeekFrom::Start(xlogoff as u64))
|
let send_size = wal_segment.read_exact(buf).await?;
|
||||||
.and_then(|_| file.read_exact(buf))
|
|
||||||
.context("Failed to read data from WAL file")?;
|
|
||||||
|
|
||||||
self.pos += send_size as u64;
|
self.pos += send_size as u64;
|
||||||
|
|
||||||
// Decide whether to reuse this file. If we don't set wal_file here
|
// Decide whether to reuse this file. If we don't set wal_segment here
|
||||||
// a new file will be opened next time.
|
// a new reader will be opened next time.
|
||||||
if self.pos.segment_offset(self.wal_seg_size) != 0 {
|
if self.pos.segment_offset(self.wal_seg_size) != 0 {
|
||||||
self.file = Some(file);
|
self.wal_segment = Some(wal_segment);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(send_size)
|
Ok(send_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Open WAL segment at the current position of the reader.
|
||||||
|
async fn open_segment(&self) -> Result<Pin<Box<dyn AsyncRead>>> {
|
||||||
|
let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize;
|
||||||
|
let segno = self.pos.segment_number(self.wal_seg_size);
|
||||||
|
let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size);
|
||||||
|
let wal_file_path = self.timeline_dir.join(wal_file_name);
|
||||||
|
|
||||||
|
// Try to open local file, if we may have WAL locally
|
||||||
|
if self.pos >= self.local_start_lsn {
|
||||||
|
let res = Self::open_wal_file(&wal_file_path).await;
|
||||||
|
match res {
|
||||||
|
Ok(mut file) => {
|
||||||
|
file.seek(SeekFrom::Start(xlogoff as u64)).await?;
|
||||||
|
return Ok(Box::pin(file));
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
let is_not_found = e.chain().any(|e| {
|
||||||
|
if let Some(e) = e.downcast_ref::<io::Error>() {
|
||||||
|
e.kind() == io::ErrorKind::NotFound
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if !is_not_found {
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
// NotFound is expected, fall through to remote read
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to open remote file, if remote reads are enabled
|
||||||
|
if self.enable_remote_read {
|
||||||
|
let (reader, _) = read_object(wal_file_path, xlogoff as u64).await;
|
||||||
|
return Ok(Box::pin(reader));
|
||||||
|
}
|
||||||
|
|
||||||
|
bail!("WAL segment is not found")
|
||||||
|
}
|
||||||
|
|
||||||
/// Helper function for opening a wal file.
|
/// Helper function for opening a wal file.
|
||||||
fn open_wal_file(wal_file_path: &Path) -> Result<File> {
|
async fn open_wal_file(wal_file_path: &Path) -> Result<tokio::fs::File> {
|
||||||
// First try to open the .partial file.
|
// First try to open the .partial file.
|
||||||
let mut partial_path = wal_file_path.to_owned();
|
let mut partial_path = wal_file_path.to_owned();
|
||||||
partial_path.set_extension("partial");
|
partial_path.set_extension("partial");
|
||||||
if let Ok(opened_file) = File::open(&partial_path) {
|
if let Ok(opened_file) = tokio::fs::File::open(&partial_path).await {
|
||||||
return Ok(opened_file);
|
return Ok(opened_file);
|
||||||
}
|
}
|
||||||
|
|
||||||
// If that failed, try it without the .partial extension.
|
// If that failed, try it without the .partial extension.
|
||||||
File::open(&wal_file_path)
|
tokio::fs::File::open(&wal_file_path)
|
||||||
|
.await
|
||||||
.with_context(|| format!("Failed to open WAL file {:?}", wal_file_path))
|
.with_context(|| format!("Failed to open WAL file {:?}", wal_file_path))
|
||||||
.map_err(|e| {
|
.map_err(|e| {
|
||||||
error!("{}", e);
|
warn!("{}", e);
|
||||||
e
|
e
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
222
scripts/export_import_betwen_pageservers.py
Executable file
222
scripts/export_import_betwen_pageservers.py
Executable file
@@ -0,0 +1,222 @@
|
|||||||
|
#
|
||||||
|
# Simple script to export nodes from one pageserver
|
||||||
|
# and import them into another page server
|
||||||
|
#
|
||||||
|
from os import path
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
import uuid
|
||||||
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# directory to save exported tar files to
|
||||||
|
basepath = path.dirname(path.abspath(__file__))
|
||||||
|
|
||||||
|
|
||||||
|
class NeonPageserverApiException(Exception):
    """Error raised when a pageserver HTTP API response has an error status."""
    pass
|
||||||
|
|
||||||
|
|
||||||
|
class NeonPageserverHttpClient(requests.Session):
    """Small HTTP client for the pageserver management API.

    Subclasses ``requests.Session`` so all calls share one session. Every
    request is sent to ``http://{host}:{port}/v1/...``.
    """

    def __init__(self, host, port):
        super().__init__()
        self.host = host
        self.port = port

    def verbose_error(self, res: requests.Response):
        """Raise ``NeonPageserverApiException`` if *res* has an error status.

        The exception message is the ``msg`` field of the JSON error body, or
        an empty string when the body is not JSON or has no such field. The
        original ``requests`` error is chained as the cause.
        """
        try:
            res.raise_for_status()
        except requests.RequestException as e:
            try:
                msg = res.json()['msg']
            except (ValueError, KeyError, TypeError):
                # ValueError: body is not valid JSON.
                # KeyError / TypeError: JSON body without a usable 'msg' entry.
                # Anything else (e.g. KeyboardInterrupt) must propagate.
                msg = ''
            raise NeonPageserverApiException(msg) from e

    def check_status(self):
        """Query ``/v1/status`` and raise ``requests.HTTPError`` on an error status."""
        self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status()

    def tenant_list(self):
        """Return the JSON list produced by ``GET /v1/tenant``."""
        res = self.get(f"http://{self.host}:{self.port}/v1/tenant")
        self.verbose_error(res)
        res_json = res.json()
        assert isinstance(res_json, list)
        return res_json

    def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists):
        """Create a tenant with the given id via ``POST /v1/tenant``.

        A 409 response (tenant already exists) is only reported on stdout when
        *ok_if_exists* is truthy; otherwise it raises ``requests.HTTPError``.
        Any other non-201 error status raises ``NeonPageserverApiException``.

        Returns *new_tenant_id* unchanged.
        """
        res = self.post(
            f"http://{self.host}:{self.port}/v1/tenant",
            json={
                'new_tenant_id': new_tenant_id.hex,
            },
        )

        if res.status_code == 409:
            if ok_if_exists:
                print(f'could not create tenant: already exists for id {new_tenant_id}')
            else:
                res.raise_for_status()
        elif res.status_code == 201:
            print(f'created tenant {new_tenant_id}')
        else:
            self.verbose_error(res)

        return new_tenant_id

    def timeline_list(self, tenant_id: uuid.UUID):
        """Return the JSON list produced by ``GET /v1/tenant/{tenant_id}/timeline``."""
        res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline")
        self.verbose_error(res)
        res_json = res.json()
        assert isinstance(res_json, list)
        return res_json
|
||||||
|
|
||||||
|
|
||||||
|
def main(args: argparse.Namespace):
    """Copy all timelines of the requested tenants between two pageservers.

    For every tenant id in ``args.tenants`` the timelines are listed on the
    old pageserver. Unless ``args.only_import`` is set, the tenant is created
    on the new pageserver and each timeline is exported with ``fullbackup``
    into ``<tenant>_<timeline>.tar`` next to this script. Each tar file is
    then fed to ``import basebackup`` on the new pageserver.

    Output of the psql invocations is captured into ``*.stderr`` /
    ``*.stdout`` files in the same directory.

    Raises:
        subprocess.CalledProcessError: if an export or import psql command
            exits with a non-zero status.
    """
    old_pageserver_host = args.old_pageserver_host
    new_pageserver_host = args.new_pageserver_host
    tenants = args.tenants

    old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port)
    old_http_client.check_status()
    old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}"

    new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port)
    new_http_client.check_status()
    new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}"

    psql_env = {**os.environ, 'LD_LIBRARY_PATH': '/usr/local/lib/'}
    psql_path = Path(args.psql_path)

    for tenant_id in tenants:
        print(f"Tenant: {tenant_id}")
        timelines = old_http_client.timeline_list(uuid.UUID(tenant_id))
        print(f"Timelines: {timelines}")

        # Create tenant in new pageserver
        if not args.only_import:
            new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists)

        for timeline in timelines:
            timeline_key = f"{timeline['tenant_id']}_{timeline['timeline_id']}"
            last_record_lsn = timeline['local']['last_record_lsn']
            tar_filename = path.join(basepath, f"{timeline_key}.tar")

            # Export timelines from old pageserver
            if not args.only_import:
                query = (f"fullbackup {timeline['tenant_id']} "
                         f"{timeline['timeline_id']} {last_record_lsn}")

                cmd = ["psql", "--no-psqlrc", old_pageserver_connstr, "-c", query]
                print(f"Running: {cmd}")

                stderr_filename = path.join(basepath, f"{timeline_key}.stderr")

                # The tar stream is binary data, so open the target in binary mode.
                with open(tar_filename, 'wb') as stdout_f, \
                        open(stderr_filename, 'w') as stderr_f:
                    print(f"(capturing output to {tar_filename})")
                    # check=True: do not go on to import a truncated tar
                    # when the export itself failed.
                    subprocess.run(cmd,
                                   stdout=stdout_f,
                                   stderr=stderr_f,
                                   env=psql_env,
                                   check=True)

                print(f"Done export: {tar_filename}")

            # Import timelines to new pageserver
            import_cmd = (f"import basebackup {timeline['tenant_id']} "
                          f"{timeline['timeline_id']} {last_record_lsn} {last_record_lsn}")
            # Argument list + stdin redirection instead of a shell pipeline
            # ("cat tar | psql ..."): nothing is interpreted by a shell.
            full_cmd = [str(psql_path), new_pageserver_connstr, "-c", import_cmd]

            stderr_filename2 = path.join(basepath, f"import_{timeline_key}.stderr")
            stdout_filename = path.join(basepath, f"import_{timeline_key}.stdout")

            print(f"Running: {full_cmd} < {tar_filename}")

            with open(tar_filename, 'rb') as tar_f, \
                    open(stdout_filename, 'w') as stdout_f, \
                    open(stderr_filename2, 'w') as stderr_f:
                print(f"(capturing output to {stdout_filename})")
                subprocess.run(full_cmd,
                               stdin=tar_f,
                               stdout=stdout_f,
                               stderr=stderr_f,
                               env=psql_env,
                               check=True)

            print("Done import")
|
||||||
|
|
||||||
|
|
||||||
|
# Command-line entry point: parse the migration options and run main().
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--tenant-id',
        dest='tenants',
        required=True,
        nargs='+',
        help='Id of the tenant to migrate. You can pass multiple arguments',
    )
    parser.add_argument(
        '--from-host',
        dest='old_pageserver_host',
        required=True,
        help='Host of the pageserver to migrate data from',
    )
    parser.add_argument(
        '--from-http-port',
        dest='old_pageserver_http_port',
        required=False,
        type=int,
        default=9898,
        help='HTTP port of the pageserver to migrate data from. Default: 9898',
    )
    parser.add_argument(
        '--from-pg-port',
        dest='old_pageserver_pg_port',
        required=False,
        type=int,
        default=6400,
        help='pg port of the pageserver to migrate data from. Default: 6400',
    )
    parser.add_argument(
        '--to-host',
        dest='new_pageserver_host',
        required=True,
        help='Host of the pageserver to migrate data to',
    )
    parser.add_argument(
        '--to-http-port',
        dest='new_pageserver_http_port',
        required=False,
        default=9898,
        type=int,
        help='HTTP port of the pageserver to migrate data to. Default: 9898',
    )
    parser.add_argument(
        '--to-pg-port',
        dest='new_pageserver_pg_port',
        required=False,
        default=6400,
        type=int,
        help='pg port of the pageserver to migrate data to. Default: 6400',
    )
    parser.add_argument(
        '--ignore-tenant-exists',
        dest='ok_if_exists',
        required=False,
        # Usable as a bare flag (-> True). nargs='?' keeps accepting the old
        # "--ignore-tenant-exists <value>" form, which stays truthy as before.
        nargs='?',
        const=True,
        default=False,
        help=
        'Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.',
    )
    parser.add_argument(
        '--psql-path',
        dest='psql_path',
        required=False,
        default='/usr/local/bin/psql',
        help='Path to the psql binary. Default: /usr/local/bin/psql',
    )
    parser.add_argument(
        '--only-import',
        dest='only_import',
        required=False,
        default=False,
        action='store_true',
        help='Skip export and tenant creation part',
    )
    args = parser.parse_args()
    main(args)
|
||||||
@@ -26,6 +26,7 @@ KEY_EXCLUDE_FIELDS = frozenset({
|
|||||||
})
|
})
|
||||||
NEGATIVE_COLOR = 'negative'
|
NEGATIVE_COLOR = 'negative'
|
||||||
POSITIVE_COLOR = 'positive'
|
POSITIVE_COLOR = 'positive'
|
||||||
|
EPS = 1e-6
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -120,7 +121,8 @@ def get_row_values(columns: List[str], run_result: SuitRun,
|
|||||||
# this might happen when new metric is added and there is no value for it in previous run
|
# this might happen when new metric is added and there is no value for it in previous run
|
||||||
# let this be here, TODO add proper handling when this actually happens
|
# let this be here, TODO add proper handling when this actually happens
|
||||||
raise ValueError(f'{column} not found in previous result')
|
raise ValueError(f'{column} not found in previous result')
|
||||||
ratio = float(value) / float(prev_value['value']) - 1
|
# adding `EPS` to each term to avoid ZeroDivisionError when the denominator is zero
|
||||||
|
ratio = (float(value) + EPS) / (float(prev_value['value']) + EPS) - 1
|
||||||
ratio_display, color = format_ratio(ratio, current_value['report'])
|
ratio_display, color = format_ratio(ratio, current_value['report'])
|
||||||
row_values.append(RowValue(value, color, ratio_display))
|
row_values.append(RowValue(value, color, ratio_display))
|
||||||
return row_values
|
return row_values
|
||||||
|
|||||||
@@ -28,6 +28,10 @@ strict = true
|
|||||||
# There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577
|
# There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577
|
||||||
ignore_missing_imports = true
|
ignore_missing_imports = true
|
||||||
|
|
||||||
|
[mypy-pg8000.*]
|
||||||
|
# Used only in testing clients
|
||||||
|
ignore_missing_imports = true
|
||||||
|
|
||||||
[mypy-cached_property.*]
|
[mypy-cached_property.*]
|
||||||
ignore_missing_imports = true
|
ignore_missing_imports = true
|
||||||
|
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ If you want to run all tests that have the string "bench" in their names:
|
|||||||
|
|
||||||
Useful environment variables:
|
Useful environment variables:
|
||||||
|
|
||||||
`ZENITH_BIN`: The directory where zenith binaries can be found.
|
`NEON_BIN`: The directory where neon binaries can be found.
|
||||||
`POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found.
|
`POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found.
|
||||||
`TEST_OUTPUT`: Set the directory where test state and test output files
|
`TEST_OUTPUT`: Set the directory where test state and test output files
|
||||||
should go.
|
should go.
|
||||||
|
|||||||
@@ -1,6 +1,3 @@
|
|||||||
from contextlib import closing
|
|
||||||
|
|
||||||
import psycopg2.extras
|
|
||||||
import pytest
|
import pytest
|
||||||
from fixtures.log_helper import log
|
from fixtures.log_helper import log
|
||||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException
|
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException
|
||||||
|
|||||||
@@ -1,13 +1,11 @@
|
|||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
from typing import Iterator
|
from uuid import uuid4
|
||||||
from uuid import UUID, uuid4
|
|
||||||
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException
|
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException
|
||||||
from requests.exceptions import HTTPError
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
|
def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
|
||||||
neon_env_builder.pageserver_auth_enabled = True
|
neon_env_builder.auth_enabled = True
|
||||||
env = neon_env_builder.init_start()
|
env = neon_env_builder.init_start()
|
||||||
|
|
||||||
ps = env.pageserver
|
ps = env.pageserver
|
||||||
@@ -54,7 +52,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
|
|||||||
|
|
||||||
@pytest.mark.parametrize('with_safekeepers', [False, True])
|
@pytest.mark.parametrize('with_safekeepers', [False, True])
|
||||||
def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool):
|
def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool):
|
||||||
neon_env_builder.pageserver_auth_enabled = True
|
neon_env_builder.auth_enabled = True
|
||||||
if with_safekeepers:
|
if with_safekeepers:
|
||||||
neon_env_builder.num_safekeepers = 3
|
neon_env_builder.num_safekeepers = 3
|
||||||
env = neon_env_builder.init_start()
|
env = neon_env_builder.init_start()
|
||||||
|
|||||||
@@ -1,11 +1,9 @@
|
|||||||
from contextlib import closing, contextmanager
|
from contextlib import closing, contextmanager
|
||||||
import psycopg2.extras
|
import psycopg2.extras
|
||||||
import pytest
|
import pytest
|
||||||
from fixtures.neon_fixtures import PgProtocol, NeonEnvBuilder
|
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||||
from fixtures.log_helper import log
|
from fixtures.log_helper import log
|
||||||
import os
|
|
||||||
import time
|
import time
|
||||||
import asyncpg
|
|
||||||
from fixtures.neon_fixtures import Postgres
|
from fixtures.neon_fixtures import Postgres
|
||||||
import threading
|
import threading
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,6 @@
|
|||||||
import pytest
|
import pytest
|
||||||
from contextlib import closing
|
|
||||||
|
|
||||||
from fixtures.neon_fixtures import NeonEnv
|
from fixtures.neon_fixtures import NeonEnv
|
||||||
from fixtures.log_helper import log
|
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
import subprocess
|
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
|
|
||||||
import psycopg2.extras
|
import psycopg2.extras
|
||||||
|
|||||||
@@ -35,9 +35,14 @@ def test_createdb(neon_simple_env: NeonEnv):
|
|||||||
with closing(db.connect(dbname='foodb')) as conn:
|
with closing(db.connect(dbname='foodb')) as conn:
|
||||||
with conn.cursor() as cur:
|
with conn.cursor() as cur:
|
||||||
# Check database size in both branches
|
# Check database size in both branches
|
||||||
cur.execute(
|
cur.execute("""
|
||||||
'select pg_size_pretty(pg_database_size(%s)), pg_size_pretty(sum(pg_relation_size(oid))) from pg_class where relisshared is false;',
|
select pg_size_pretty(pg_database_size('foodb')),
|
||||||
('foodb', ))
|
pg_size_pretty(
|
||||||
|
sum(pg_relation_size(oid, 'main'))
|
||||||
|
+sum(pg_relation_size(oid, 'vm'))
|
||||||
|
+sum(pg_relation_size(oid, 'fsm'))
|
||||||
|
) FROM pg_class where relisshared is false
|
||||||
|
""")
|
||||||
res = cur.fetchone()
|
res = cur.fetchone()
|
||||||
# check that dbsize equals sum of all relation sizes, excluding shared ones
|
# check that dbsize equals sum of all relation sizes, excluding shared ones
|
||||||
# This is how we define dbsize in neon for now
|
# This is how we define dbsize in neon for now
|
||||||
|
|||||||
68
test_runner/batch_others/test_fullbackup.py
Normal file
68
test_runner/batch_others/test_fullbackup.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
from contextlib import closing
|
||||||
|
|
||||||
|
from fixtures.log_helper import log
|
||||||
|
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres
|
||||||
|
from fixtures.neon_fixtures import pg_distrib_dir
|
||||||
|
import os
|
||||||
|
from fixtures.utils import subprocess_capture
|
||||||
|
|
||||||
|
num_rows = 1000
|
||||||
|
|
||||||
|
|
||||||
|
# Ensure that regular postgres can start from fullbackup
|
||||||
|
def test_fullbackup(neon_env_builder: NeonEnvBuilder,
                    pg_bin: PgBin,
                    port_distributor: PortDistributor):
    """Ensure that a regular (vanilla) postgres can start from a pageserver fullbackup.

    The test loads `num_rows` rows into a compute node, requests a `fullbackup`
    tarball from the pageserver at the current WAL insert LSN, unpacks it,
    runs pg_resetwal over the result and finally starts a vanilla postgres on
    the restored data directory to count the rows.
    """
    branch_name = 'test_fullbackup'

    neon_env_builder.num_safekeepers = 1
    env = neon_env_builder.init_start()

    env.neon_cli.create_branch(branch_name)
    pgmain = env.postgres.create_start(branch_name)
    log.info("postgres is running on 'test_fullbackup' branch")

    timeline = pgmain.safe_psql("SHOW neon.timeline_id")[0][0]

    with closing(pgmain.connect()) as conn, conn.cursor() as cur:
        # Loading the data can be slow, hence the generous statement timeout.
        cur.execute("SET statement_timeout='300s'")
        cur.execute(f'''CREATE TABLE tbl AS SELECT 'long string to consume some space' || g
                    from generate_series(1,{num_rows}) g''')
        cur.execute("CHECKPOINT")

        cur.execute('SELECT pg_current_wal_insert_lsn()')
        lsn = cur.fetchone()[0]
        log.info(f"start_backup_lsn = {lsn}")

    # psql must pick up the libpq from our postgres distribution. PgBin would
    # set LD_LIBRARY_PATH automatically, but we pass an explicit env because
    # the psql output is captured and handed to tar.
    lib_dir = os.path.join(str(pg_distrib_dir), 'lib')
    psql_env = {'LD_LIBRARY_PATH': lib_dir}

    # Fetch the fullbackup from the pageserver and unpack it.
    restored_dir_path = env.repo_dir / "restored_datadir"
    os.mkdir(restored_dir_path, 0o750)
    query = f"fullbackup {env.initial_tenant.hex} {timeline} {lsn}"
    backup_cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query]
    result_basepath = pg_bin.run_capture(backup_cmd, env=psql_env)
    tar_output_file = result_basepath + ".stdout"
    untar_cmd = ["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)]
    subprocess_capture(str(env.repo_dir), untar_cmd)

    # HACK
    # fullbackup returns a neon specific pg_control and first WAL segment;
    # pg_resetwal is used to overwrite them.
    pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, 'pg_resetwal')
    resetwal_cmd = [pg_resetwal_path, "-D", str(restored_dir_path)]
    pg_bin.run_capture(resetwal_cmd, env=psql_env)

    # Start vanilla postgres on the restored datadir and look for our rows.
    port = port_distributor.get_port()
    with VanillaPostgres(restored_dir_path, pg_bin, port, init=False) as vanilla_pg:
        # TODO make port an optional argument
        vanilla_pg.configure([f"port={port}"])
        vanilla_pg.start()
        count_rows = vanilla_pg.safe_psql('select count(*) from tbl;', user="cloud_admin")
        num_rows_found = count_rows[0][0]
        assert num_rows == num_rows_found
|
||||||
198
test_runner/batch_others/test_import.py
Normal file
198
test_runner/batch_others/test_import.py
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
import pytest
|
||||||
|
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_upload, wait_for_last_record_lsn
|
||||||
|
from fixtures.utils import lsn_from_hex, lsn_to_hex
|
||||||
|
from uuid import UUID, uuid4
|
||||||
|
import tarfile
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
from fixtures.utils import subprocess_capture
|
||||||
|
from fixtures.log_helper import log
|
||||||
|
from contextlib import closing
|
||||||
|
from fixtures.neon_fixtures import pg_distrib_dir
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.timeout(600)
def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder):
    """Import a pg_basebackup taken from vanilla postgres into the pageserver.

    A deliberately corrupted base tarball (pg_control removed) must be
    rejected; the untouched tarball must import and serve the original rows.
    """
    row_count_query = 'select count(*) from t'

    # Populate vanilla postgres with some data.
    vanilla_pg.start()
    vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
    vanilla_pg.safe_psql('''create table t as select 'long string to consume some space' || g
                         from generate_series(1,300000) g''')
    assert vanilla_pg.safe_psql(row_count_query) == [(300000, )]

    # Take a tar-format basebackup.
    basebackup_dir = os.path.join(test_output_dir, "basebackup")
    base_tar = os.path.join(basebackup_dir, "base.tar")
    wal_tar = os.path.join(basebackup_dir, "pg_wal.tar")
    os.mkdir(basebackup_dir)
    vanilla_pg.safe_psql("CHECKPOINT")
    basebackup_cmd = ["pg_basebackup", "-F", "tar", "-d", vanilla_pg.connstr(), "-D", basebackup_dir]
    pg_bin.run(basebackup_cmd)

    # Build a corrupt copy of the base tarball that lacks pg_control.
    unpacked_base = os.path.join(basebackup_dir, "unpacked-base")
    corrupt_base_tar = os.path.join(unpacked_base, "corrupt-base.tar")
    os.mkdir(unpacked_base, 0o750)
    subprocess_capture(str(test_output_dir), ["tar", "-xf", base_tar, "-C", unpacked_base])
    os.remove(os.path.join(unpacked_base, "global/pg_control"))
    # The directory listing is taken before tar creates the archive, so the
    # archive does not try to include itself.
    repack_cmd = ["tar", "-cf", "corrupt-base.tar"] + os.listdir(unpacked_base)
    subprocess_capture(str(test_output_dir), repack_cmd, cwd=unpacked_base)

    # Read start_lsn and end_lsn from the backup manifest.
    with open(os.path.join(basebackup_dir, "backup_manifest")) as f:
        manifest = json.load(f)
    wal_range = manifest["WAL-Ranges"][0]
    start_lsn = wal_range["Start-LSN"]
    end_lsn = wal_range["End-LSN"]

    node_name = "import_from_vanilla"
    tenant = uuid4()
    timeline = uuid4()

    # Prepare a pageserver to import into.
    neon_env_builder.enable_local_fs_remote_storage()
    env = neon_env_builder.init_start()
    env.pageserver.http_client().tenant_create(tenant)

    def import_tar(base, wal):
        # Run `timeline import` for the given base and WAL tarballs.
        cli_args = [
            "timeline",
            "import",
            "--tenant-id",
            tenant.hex,
            "--timeline-id",
            timeline.hex,
            "--node-name",
            node_name,
            "--base-lsn",
            start_lsn,
            "--base-tarfile",
            base,
            "--end-lsn",
            end_lsn,
            "--wal-tarfile",
            wal,
        ]
        env.neon_cli.raw_cli(cli_args)

    # The corrupt backup must be refused.
    with pytest.raises(Exception):
        import_tar(corrupt_base_tar, wal_tar)

    # Clean up after the failed attempt.
    # TODO it should clean itself
    client = env.pageserver.http_client()
    client.timeline_detach(tenant, timeline)

    # The intact backup must be accepted.
    import_tar(base_tar, wal_tar)

    # Wait until the imported data has been processed and uploaded.
    target_lsn = lsn_from_hex(end_lsn)
    wait_for_last_record_lsn(client, tenant, timeline, target_lsn)
    wait_for_upload(client, tenant, timeline, lsn_from_hex(end_lsn))

    # Verify that a compute node sees the imported rows.
    pg = env.postgres.create_start(node_name, tenant_id=tenant)
    assert pg.safe_psql(row_count_query) == [(300000, )]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.timeout(600)
def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_builder):
    """Export a timeline with `fullbackup` and import it back under a new tenant.

    Steps: load `num_rows` rows, take a fullbackup from the pageserver, wipe
    the pageserver's `tenants` directory, restart it, import the tarball into
    a freshly created tenant, verify the rows, verify that a second fullbackup
    has the same size as the first one, and finally run gc on the imported
    timeline.
    """
    num_rows = 3000
    neon_env_builder.num_safekeepers = 1
    neon_env_builder.enable_local_fs_remote_storage()
    env = neon_env_builder.init_start()

    env.neon_cli.create_branch('test_import_from_pageserver')
    pgmain = env.postgres.create_start('test_import_from_pageserver')
    log.info("postgres is running on 'test_import_from_pageserver' branch")

    timeline = pgmain.safe_psql("SHOW neon.timeline_id")[0][0]

    with closing(pgmain.connect()) as conn:
        with conn.cursor() as cur:
            # data loading may take a while, so increase statement timeout
            cur.execute("SET statement_timeout='300s'")
            cur.execute(f'''CREATE TABLE tbl AS SELECT 'long string to consume some space' || g
                        from generate_series(1,{num_rows}) g''')
            cur.execute("CHECKPOINT")

            cur.execute('SELECT pg_current_wal_insert_lsn()')
            lsn = cur.fetchone()[0]
            log.info(f"start_backup_lsn = {lsn}")

    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
    # PgBin sets it automatically, but here the psql output is captured to a file.
    psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')}

    # Get a fullbackup from pageserver
    query = f"fullbackup { env.initial_tenant.hex} {timeline} {lsn}"
    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query]
    result_basepath = pg_bin.run_capture(cmd, env=psql_env)
    tar_output_file = result_basepath + ".stdout"

    # Stop the first pageserver instance, erase all its data
    env.postgres.stop_all()
    env.pageserver.stop()

    dir_to_clear = Path(env.repo_dir) / 'tenants'
    shutil.rmtree(dir_to_clear)
    os.mkdir(dir_to_clear)

    # Start the pageserver again
    env.pageserver.start()

    # Import using another tenantid, because we use the same pageserver.
    # TODO Create another pageserver to make test more realistic.
    tenant = uuid4()

    # Import to pageserver
    node_name = "import_from_pageserver"
    client = env.pageserver.http_client()
    client.tenant_create(tenant)
    env.neon_cli.raw_cli([
        "timeline",
        "import",
        "--tenant-id",
        tenant.hex,
        "--timeline-id",
        timeline,
        "--node-name",
        node_name,
        "--base-lsn",
        lsn,
        "--base-tarfile",
        # The path is passed as is; the original single-argument
        # os.path.join() around it was a no-op.
        tar_output_file,
    ])

    # Wait for data to land in the (local fs) remote storage
    wait_for_last_record_lsn(client, tenant, UUID(timeline), lsn_from_hex(lsn))
    wait_for_upload(client, tenant, UUID(timeline), lsn_from_hex(lsn))

    # Check it worked
    pg = env.postgres.create_start(node_name, tenant_id=tenant)
    assert pg.safe_psql('select count(*) from tbl') == [(num_rows, )]

    # Take another fullbackup
    query = f"fullbackup { tenant.hex} {timeline} {lsn}"
    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query]
    result_basepath = pg_bin.run_capture(cmd, env=psql_env)
    new_tar_output_file = result_basepath + ".stdout"

    # Check it's the same as the first fullbackup
    # TODO pageserver should be checking checksum
    assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file)

    # Check that gc works. Close the connection and cursor when done instead
    # of leaking them until interpreter exit.
    with closing(env.pageserver.connect()) as psconn:
        with psconn.cursor() as pscur:
            pscur.execute(f"do_gc {tenant.hex} {timeline} 0")
|
||||||
@@ -42,8 +42,8 @@ def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_s
|
|||||||
Repeat check for several tenants/timelines.
|
Repeat check for several tenants/timelines.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
env = neon_env_builder.init_start()
|
|
||||||
neon_env_builder.num_safekeepers = num_safekeepers
|
neon_env_builder.num_safekeepers = num_safekeepers
|
||||||
|
env = neon_env_builder.init_start()
|
||||||
pageserver_http = env.pageserver.http_client()
|
pageserver_http = env.pageserver.http_client()
|
||||||
|
|
||||||
for _ in range(num_timelines):
|
for _ in range(num_timelines):
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ def test_pageserver_http_api_client(neon_simple_env: NeonEnv):
|
|||||||
|
|
||||||
|
|
||||||
def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder):
|
def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder):
|
||||||
neon_env_builder.pageserver_auth_enabled = True
|
neon_env_builder.auth_enabled = True
|
||||||
env = neon_env_builder.init_start()
|
env = neon_env_builder.init_start()
|
||||||
|
|
||||||
management_token = env.auth_keys.generate_management_token()
|
management_token = env.auth_keys.generate_management_token()
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import pytest
|
|||||||
|
|
||||||
|
|
||||||
def test_proxy_select_1(static_proxy):
|
def test_proxy_select_1(static_proxy):
|
||||||
static_proxy.safe_psql("select 1;")
|
static_proxy.safe_psql("select 1;", options="project=generic-project-name")
|
||||||
|
|
||||||
|
|
||||||
# Pass extra options to the server.
|
# Pass extra options to the server.
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
# It's possible to run any regular test with the local fs remote storage via
|
# It's possible to run any regular test with the local fs remote storage via
|
||||||
# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/zenith_zzz/'}" poetry ......
|
# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
|
||||||
|
|
||||||
import shutil, os
|
import shutil, os
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ from fixtures.log_helper import log
|
|||||||
#
|
#
|
||||||
@pytest.mark.parametrize('with_safekeepers', [False, True])
|
@pytest.mark.parametrize('with_safekeepers', [False, True])
|
||||||
def test_restart_compute(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool):
|
def test_restart_compute(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool):
|
||||||
neon_env_builder.pageserver_auth_enabled = True
|
neon_env_builder.auth_enabled = True
|
||||||
if with_safekeepers:
|
if with_safekeepers:
|
||||||
neon_env_builder.num_safekeepers = 3
|
neon_env_builder.num_safekeepers = 3
|
||||||
env = neon_env_builder.init_start()
|
env = neon_env_builder.init_start()
|
||||||
|
|||||||
70
test_runner/batch_others/test_tenant_tasks.py
Normal file
70
test_runner/batch_others/test_tenant_tasks.py
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
from fixtures.neon_fixtures import NeonEnvBuilder, wait_until
|
||||||
|
from uuid import UUID
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
def get_only_element(l):
    """Return the sole item of `l`, asserting that `l` holds exactly one item."""
    item_count = len(l)
    assert item_count == 1
    only = l[0]
    return only
|
||||||
|
|
||||||
|
|
||||||
|
# Test that gc and compaction tenant tasks start and stop correctly
|
||||||
|
def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
    """Test that gc and compaction tenant tasks start and stop correctly.

    A tenant with a running compute must be "Active". After the compute is
    stopped and all timelines of every tenant are detached, each tenant must
    become "Idle", and the number of started tenant tasks must equal the
    number of stopped ones with no panics.
    """
    # The gc and compaction loops don't bother to watch for tenant state
    # changes while sleeping, so we use small periods to make this test
    # run faster. With default settings we'd have to wait longer for tasks
    # to notice state changes and shut down.
    # TODO fix this behavior in the pageserver
    tenant_config = "{gc_period = '1 s', compaction_period = '1 s'}"
    neon_env_builder.pageserver_config_override = f"tenant_config={tenant_config}"
    name = "test_tenant_tasks"
    env = neon_env_builder.init_start()
    client = env.pageserver.http_client()

    def get_state(tenant):
        # Look up the state reported by the pageserver for exactly this tenant.
        all_states = client.tenant_list()
        matching = [t for t in all_states if t["id"] == tenant.hex]
        return get_only_element(matching)["state"]

    def get_metric_value(name):
        # Return the integer value of the single metrics line that starts
        # with `name`, or 0 when no such line exists.
        metrics = client.get_metrics()
        relevant = [line for line in metrics.splitlines() if line.startswith(name)]
        if not relevant:
            return 0
        line = get_only_element(relevant)
        # Slice off the prefix explicitly: str.lstrip(name) removes any leading
        # characters contained in `name` rather than the prefix itself.
        value = line[len(name):].strip()
        return int(value)

    def detach_all_timelines(tenant):
        timelines = [UUID(t["timeline_id"]) for t in client.timeline_list(tenant)]
        for t in timelines:
            client.timeline_detach(tenant, t)

    def assert_idle(tenant):
        assert get_state(tenant) == "Idle"

    # Create tenant, start compute
    tenant, _ = env.neon_cli.create_tenant()
    env.neon_cli.create_timeline(name, tenant_id=tenant)
    pg = env.postgres.create_start(name, tenant_id=tenant)
    assert (get_state(tenant) == "Active")

    # Stop compute
    pg.stop()

    # Detach all tenants and wait for them to go idle
    # TODO they should be already idle since there are no active computes
    for tenant_info in client.tenant_list():
        tenant_id = UUID(tenant_info["id"])
        detach_all_timelines(tenant_id)
        wait_until(10, 0.2, lambda: assert_idle(tenant_id))

    # Assert that all tasks finish quickly after tenants go idle
    def assert_tasks_finish():
        tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}')
        tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}')
        tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}')
        assert tasks_started == tasks_ended
        assert tasks_panicked == 0

    wait_until(10, 0.2, assert_tasks_finish)
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user