Compare commits

..

67 Commits

Author SHA1 Message Date
Bojan Serafimov
64fcf4f096 Implement mock console 2022-02-09 14:30:01 -05:00
Dmitry Ivanov
18d3d078ad [WIP] [proxy] Migrate to async 2022-02-08 05:43:32 +03:00
Andrey Taranik
d69b0539ba proxy chart staging values update for labels (#1202) 2022-02-01 13:31:05 +03:00
Dmitry Ivanov
ec78babad2 Use mold instead of default linker 2022-01-28 20:40:50 +03:00
Dmitry Ivanov
9350dfb215 [CI] Merge *.profraw files prior to uploading workspace
Hopefully, this will make the CI pipeline a bit faster.
2022-01-28 19:56:28 +03:00
Dmitry Ivanov
8ac8be5206 [scripts/coverage] Implement merge command
This will drastically decrease the size of CI workspace uploads.
2022-01-28 19:56:28 +03:00
Dmitry Ivanov
c2927353a5 Enable async deserialization of FeMessage
Now it's possible to call Fe{Startup,}Message in both
sync and async contexts, which is good for the proxy.

Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
2022-01-28 19:40:37 +03:00
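
A minimal sketch of what such a split can look like: the byte-level parsing stays synchronous and shared, while thin sync and async wrappers feed it from a blocking or tokio stream. Names and framing details here are illustrative, not the repo's actual FeMessage API.

    use bytes::{Buf, BytesMut};
    use std::io::Read;
    use tokio::io::{AsyncRead, AsyncReadExt};

    /// Illustrative frontend message; the real FeMessage has more variants.
    pub enum FeMessage {
        Query(Vec<u8>),
        Terminate,
    }

    /// Try to parse one complete message out of `buf`; return None if more bytes
    /// are needed. Purely synchronous, so both read paths below can share it.
    /// (A real parser would also validate the length field.)
    fn parse(buf: &mut BytesMut) -> Option<FeMessage> {
        if buf.len() < 5 {
            return None;
        }
        let tag = buf[0];
        let len = u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]) as usize;
        if buf.len() < 1 + len {
            return None;
        }
        buf.advance(5); // tag byte + length word
        let body = buf.split_to(len - 4).to_vec();
        Some(match tag {
            b'X' => FeMessage::Terminate,
            _ => FeMessage::Query(body),
        })
    }

    /// Blocking read path (threaded handlers).
    pub fn read_message_sync(
        stream: &mut impl Read,
        buf: &mut BytesMut,
    ) -> std::io::Result<Option<FeMessage>> {
        loop {
            if let Some(msg) = parse(buf) {
                return Ok(Some(msg));
            }
            let mut chunk = [0u8; 8192];
            let n = stream.read(&mut chunk)?;
            if n == 0 {
                return Ok(None); // clean EOF
            }
            buf.extend_from_slice(&chunk[..n]);
        }
    }

    /// Async read path (e.g. for the proxy).
    pub async fn read_message_async(
        stream: &mut (impl AsyncRead + Unpin),
        buf: &mut BytesMut,
    ) -> std::io::Result<Option<FeMessage>> {
        loop {
            if let Some(msg) = parse(buf) {
                return Ok(Some(msg));
            }
            if stream.read_buf(buf).await? == 0 {
                return Ok(None); // clean EOF
            }
        }
    }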
Kirill Bulatov
33251a9d8f Disable failing remote storage tests for now 2022-01-28 18:35:46 +03:00
Konstantin Knizhnik
c045ae7a9b Fix random range for keys in test_gc_aggressive.py (#1199) 2022-01-28 16:29:55 +03:00
Dmitry Rodionov
602ccb7d5f distinguish failures for pre-initdb lsn and pre-ancestor lsn branching in test_branch_behind 2022-01-28 12:31:15 +03:00
Dmitry Rodionov
5df21e1058 remove Timeline::start_lsn in favor of ancestor_lsn 2022-01-28 12:31:15 +03:00
Konstantin Knizhnik
08135910a5 Fix checkpoint.nextXid update (#1166)
* Fix checkpoint.nextXid update

* Add test for checkpoint.nextXid

* Fix indentation of test_next_xid.py

* Fix mypy error in test_next_xid.py

* Tidy up the test case.

* Add a unit test

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>
2022-01-27 18:21:51 +03:00
Konstantin Knizhnik
f58a22d07e Freeze layers at the same end LSN (#1182)
* Freeze vectors at the same end LSN

* Fix calculation of last LSN for inmem layer

* Do not advance disk_consistent_lsn if no open layer was evicted

* Fix calculation of freeze_end_lsn

* Let start_lsn be larger than oldest_pending_lsn

* Rename 'oldest_pending_lsn' and 'last_lsn', add comments.

* Fix future_layerfiles test

* Update comments concerning oldest_lsn

* Update comments concerning oldest_lsn

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>
2022-01-27 18:21:00 +03:00
Arthur Petukhovsky
cedde559b8 Add test for replacement of the failed safekeeper (#1179)
* Add test to replace failed safekeeper

* Restart safekeepers in test_replace_safekeeper

* Update vendor/postgres
2022-01-27 17:26:55 +03:00
Arthur Petukhovsky
49d1d1ddf9 Don't call adjust_for_wal_acceptors after pg create (#1178)
Now zenith_cli handles the wal_acceptors config internally; if we
also append wal_acceptors to postgresql.conf in the Python tests, the
file ends up with a duplicate wal_acceptors setting.
2022-01-27 17:23:14 +03:00
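
As a rough illustration of the idea (a hypothetical helper, not the actual zenith_cli code): whichever component owns the setting should append it only when it is not already present, so two call sites cannot produce duplicate entries.

    use std::fs;
    use std::io::Write;
    use std::path::Path;

    /// Append `wal_acceptors = '...'` to postgresql.conf only if no such line
    /// exists yet, so that CLI and tests cannot produce duplicate settings.
    fn set_wal_acceptors_once(pgdata: &Path, acceptors: &str) -> std::io::Result<()> {
        let conf_path = pgdata.join("postgresql.conf");
        let current = fs::read_to_string(&conf_path)?;
        if current.lines().any(|l| l.trim_start().starts_with("wal_acceptors")) {
            return Ok(()); // already configured, don't duplicate
        }
        let mut f = fs::OpenOptions::new().append(true).open(&conf_path)?;
        writeln!(f, "wal_acceptors = '{}'", acceptors)?;
        Ok(())
    }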
Arseny Sher
86045ac36c Prefix per-cluster directory with ztenant_id in safekeeper.
Currently ztimelineids are unique, but all APIs accept the pair, so let's keep
it everywhere for uniformity.

Carry around ZTTId containing both ZTenantId and ZTimelineId for simplicity.

(existing clusters on staging ought to be preprocessed for that)
2022-01-27 17:22:07 +03:00
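
A sketch of the resulting layout, with illustrative stand-in types (the real ids are 16-byte, hex-printable newtypes): the tenant id becomes a directory prefix, and the pair travels together as one value.

    use std::path::{Path, PathBuf};

    /// Illustrative stand-ins for the real ZTenantId / ZTimelineId types.
    #[derive(Clone, Copy)]
    struct ZTenantId([u8; 16]);
    #[derive(Clone, Copy)]
    struct ZTimelineId([u8; 16]);

    /// Tenant + timeline carried together, since a timeline id alone is not a
    /// global key.
    #[derive(Clone, Copy)]
    struct ZTTId {
        tenant_id: ZTenantId,
        timeline_id: ZTimelineId,
    }

    fn hex(bytes: &[u8; 16]) -> String {
        bytes.iter().map(|b| format!("{:02x}", b)).collect()
    }

    /// Per-cluster directory layout: <workdir>/<tenant>/<timeline>, so timelines
    /// of different tenants can never collide on disk.
    fn timeline_dir(workdir: &Path, zttid: &ZTTId) -> PathBuf {
        workdir
            .join(hex(&zttid.tenant_id.0))
            .join(hex(&zttid.timeline_id.0))
    }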
Konstantin Knizhnik
79f0e44a20 Gc cutoff rwlock (#1139)
* Reproduce github issue #1047.

* Use RwLock to protect gc_cutoff_lsn

* Reduce number of updates in test_gc_aggressive

* Change test_prohibit_get_page_at_lsn_for_garbage_collected_pages test

* Change test_prohibit_get_page_at_lsn_for_garbage_collected_pages

* Lock latest_gc_cutoff_lsn in all operations accessing storage to prevent race conditions with GC

* Remove random sleep between wait_for_lsn and get_page_at_lsn

* Initialize latest_gc_cutoff with initdb_lsn and remove separate check that lsn >= initdb_lsn

* Update test_prohibit_branch_creation_on_pre_initdb_lsn test

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>
2022-01-27 14:41:16 +03:00
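
A condensed sketch of the locking scheme (type names simplified, not the actual pageserver code): readers such as get_page_at_lsn hold the shared side of an RwLock on the GC cutoff for the duration of the access, and GC takes the exclusive side to advance it, so a page cannot be requested below a cutoff that moves mid-read.

    use std::sync::RwLock;

    /// Minimal Lsn stand-in (the real type is a newtype over u64).
    type Lsn = u64;

    struct Timeline {
        /// Pages at LSNs below this may already be garbage collected.
        latest_gc_cutoff_lsn: RwLock<Lsn>,
    }

    impl Timeline {
        /// Read path: hold the shared lock for the whole access so GC cannot
        /// advance the cutoff (and delete layers) underneath us.
        fn get_page_at_lsn(&self, lsn: Lsn) -> anyhow::Result<Vec<u8>> {
            let cutoff = self.latest_gc_cutoff_lsn.read().unwrap();
            anyhow::ensure!(
                lsn >= *cutoff,
                "requested LSN {} is below GC cutoff {}",
                lsn,
                *cutoff
            );
            // ... look up the page in the layer map while the guard is held ...
            Ok(Vec::new())
        }

        /// GC path: take the exclusive lock only when advancing the cutoff.
        fn advance_gc_cutoff(&self, new_cutoff: Lsn) {
            let mut cutoff = self.latest_gc_cutoff_lsn.write().unwrap();
            if new_cutoff > *cutoff {
                *cutoff = new_cutoff;
            }
        }
    }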
anastasia
c44695f34b bump vendor/postgres 2022-01-27 11:20:45 +03:00
anastasia
5abe2129c6 Extend replication protocol with ZenithFeedback message
to pass current_timeline_size to the compute node

Put standby_status_update fields into ZenithFeedback and send them as one message.
Pass value sizes together with keys in the ZenithFeedback message.
2022-01-27 11:20:45 +03:00
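
A rough sketch of that key/length/value framing (field names illustrative, not the exact ZenithFeedback wire format): each value is sent with its size so a receiver can skip keys it does not understand.

    use bytes::{BufMut, BytesMut};

    /// Illustrative feedback payload; the real message carries more fields.
    struct ZenithFeedback {
        current_timeline_size: u64,
        ps_writelsn: u64,
        ps_applylsn: u64,
    }

    /// Serialize as a field count followed by (key, value-length, value) triples.
    fn serialize_feedback(f: &ZenithFeedback, buf: &mut BytesMut) {
        let fields: [(&str, u64); 3] = [
            ("current_timeline_size", f.current_timeline_size),
            ("ps_writelsn", f.ps_writelsn),
            ("ps_applylsn", f.ps_applylsn),
        ];
        buf.put_u8(fields.len() as u8);
        for (key, value) in fields {
            buf.put_slice(key.as_bytes());
            buf.put_u8(0); // NUL-terminated key
            buf.put_u32(8); // value length in bytes
            buf.put_u64(value);
        }
    }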
Dmitry Rodionov
63dd7bce7e bandaid to avoid concurrent timeline downloading until proper refactoring/fix 2022-01-26 19:54:09 +03:00
Dmitry Rodionov
f3c73f5797 cache python deps in circle ci 2022-01-26 13:01:12 +03:00
Dmitry Rodionov
e6f2d70517 use 2021 rust edition 2022-01-25 18:48:49 +03:00
Andrey Taranik
be6d1cc360 Use zimg as builders (#1165)
* try using our own builder images

* add postgres headers before building zenith

* checkout submodule before zenith build

* circleci cleanup
2022-01-25 00:58:37 +03:00
Dmitry Ivanov
703716228e Use &str instead of String in BeMessage::ErrorResponse
There's no need to allocate string literals on the heap.
2022-01-24 18:49:05 +03:00
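
Sketch of the change (simplified types): borrowing the error text lets static literals be passed with no allocation, while formatted messages can still be borrowed from a local String.

    /// Simplified: the real BeMessage has many more variants.
    enum BeMessage<'a> {
        ErrorResponse(&'a str), // was ErrorResponse(String)
    }

    /// Stand-in for the real serializer.
    fn write_message(out: &mut Vec<u8>, msg: &BeMessage<'_>) {
        match msg {
            BeMessage::ErrorResponse(text) => {
                out.push(b'E');
                out.extend_from_slice(text.as_bytes());
                out.push(0);
            }
        }
    }

    fn reject(out: &mut Vec<u8>) {
        // A string literal is &'static str, so no heap allocation happens here;
        // a dynamically built message can still be passed as &some_string.
        write_message(out, &BeMessage::ErrorResponse("unexpected message type"));
    }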
Dmitry Rodionov
458bc0c838 walkeeper: use named type as a key in callmemaybe subscriptions hashmap 2022-01-24 17:20:15 +03:00
Dmitry Rodionov
39591ef627 reduce flakiness 2022-01-24 17:20:15 +03:00
Dmitry Rodionov
37c440c5d3 Introduce first version of tenant migration between pageservers
This patch adds attach/detach HTTP endpoints to the pageserver, some
changes to callmemaybe handling inside the safekeeper, and an integration
test that checks migration with and without load. There are still some
rough edges that will be addressed in follow-up patches.
2022-01-24 17:20:15 +03:00
anastasia
81e94d1897 Add LSN and Backpressure descriptions to glossary.md 2022-01-24 12:52:30 +03:00
Konstantin Knizhnik
7bc1274a03 Fix comparison with disk_consistent_lsn in newer_image_layer_exists (#1167) 2022-01-24 12:19:18 +03:00
Dmitry Rodionov
5f5a11525c Switch our python package management solution to poetry.
Mainly because it has better support for installing packages across
different Python versions.

It also has a better dependency resolver than Pipenv and supports the
modern standard for Python dependency management, which includes using
pyproject.toml for project-specific configuration instead of per-tool
config files. See the following links for details:
 https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/
 https://www.python.org/dev/peps/pep-0518/
2022-01-24 11:33:47 +03:00
Konstantin Knizhnik
e209764877 Do not delete layers beyond cutoff LSN (#1128)
* Do not delete layers beyond cutoff LSN

* Update pageserver/src/layered_repository/layer_map.rs

Co-authored-by: Heikki Linnakangas <heikki.linnakangas@iki.fi>

Co-authored-by: Heikki Linnakangas <heikki.linnakangas@iki.fi>
2022-01-24 10:42:40 +03:00
Kirill Bulatov
65290b2e96 Ensure every submodule compiles on its own 2022-01-21 17:34:15 +03:00
Dmitry Ivanov
127df96635 [proxy] Make NUM_BYTES_PROXIED_COUNTER more precise 2022-01-21 17:31:19 +03:00
Kirill Bulatov
924d8d489a Allow enabling S3 mock in all existing tests with an env var 2022-01-20 18:42:47 +02:00
Dmitry Rodionov
026eb64a83 Use python lib to mock s3 2022-01-20 18:42:47 +02:00
Kirill Bulatov
45124856b1 Better S3 remote storage logging 2022-01-20 18:42:47 +02:00
Kirill Bulatov
38c6f6ce16 Allow specifying custom endpoint in s3 2022-01-20 18:42:47 +02:00
Heikki Linnakangas
caa62eff2a Fix description of proxy --auth-endpoint option. 2022-01-20 14:50:27 +03:00
Dmitry Ivanov
d3542c34f1 Refactoring: use anyhow::Context's methods where possible 2022-01-19 16:33:48 +03:00
Kirill Bulatov
7fb62fc849 Fix macos compilation 2022-01-18 23:01:04 +02:00
Andrey Taranik
9d6ae06663 monitoring turn on for proxy (#1146) 2022-01-18 19:23:53 +03:00
Alexey Kondratov
06c28174c2 Integrate compute_tools into zenith workspace and improve logging (zenithdb/console#487) 2022-01-18 18:47:31 +03:00
bojanserafimov
8af1b43074 proxy: Add new metrics (#1132) 2022-01-14 19:12:43 -05:00
Heikki Linnakangas
17b7caddcb Update vendor/postgres: silence excessive logging from walproposer. 2022-01-14 20:51:02 +02:00
Heikki Linnakangas
dab30c27b6 Refactor thread management and shutdown
This introduces a new module to handle thread creation and shutdown.
All page server threads are now registered in a global hash map, and
there's a function to request individual threads to shut down gracefully.

Thread shutdown request is signalled to the thread with a flag, as well
as a Future that can be used to wake up async operations if shutdown is
requested. Use that facility to have the libpq listener thread respond
to pageserver shutdown, based on Kirill's earlier prototype
(https://github.com/zenithdb/zenith/pull/1088). That addresses
https://github.com/zenithdb/zenith/issues/1036: previously, the libpq
listener thread would not exit until one more connection arrived.

This also eliminates a resource leak in the accept() loop. Previously,
we added the JoinHandle of each new thread to a vector, but old handles
for threads that had already exited were never removed.
2022-01-14 18:36:10 +02:00
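
A stripped-down sketch of this kind of registry, assuming a tokio watch channel as the wake-up primitive (the actual module may differ): each registered thread gets a shutdown flag for sync code and a receiver that async code can await.

    use once_cell::sync::Lazy;
    use std::collections::HashMap;
    use std::sync::atomic::{AtomicBool, Ordering};
    use std::sync::{Arc, Mutex};

    /// One entry per registered page server thread (fields trimmed for the sketch).
    struct PageServerThread {
        name: String,
        shutdown_requested: AtomicBool,
        shutdown_tx: tokio::sync::watch::Sender<bool>,
        shutdown_rx: tokio::sync::watch::Receiver<bool>,
    }

    static THREADS: Lazy<Mutex<HashMap<u64, Arc<PageServerThread>>>> =
        Lazy::new(|| Mutex::new(HashMap::new()));

    fn register(id: u64, name: &str) -> Arc<PageServerThread> {
        let (tx, rx) = tokio::sync::watch::channel(false);
        let t = Arc::new(PageServerThread {
            name: name.to_string(),
            shutdown_requested: AtomicBool::new(false),
            shutdown_tx: tx,
            shutdown_rx: rx,
        });
        THREADS.lock().unwrap().insert(id, Arc::clone(&t));
        t
    }

    /// Ask one thread to shut down: set the flag and wake any async waiters.
    fn request_shutdown(id: u64) {
        if let Some(t) = THREADS.lock().unwrap().get(&id) {
            t.shutdown_requested.store(true, Ordering::Relaxed);
            let _ = t.shutdown_tx.send(true);
        }
    }

    /// Inside a thread: cheap flag check for sync code paths.
    fn shutdown_requested(t: &PageServerThread) -> bool {
        t.shutdown_requested.load(Ordering::Relaxed)
    }

    /// Inside async code (e.g. the libpq listener): resolves once shutdown is requested.
    async fn await_shutdown(t: &PageServerThread) {
        let mut rx = t.shutdown_rx.clone();
        while !*rx.borrow() {
            if rx.changed().await.is_err() {
                break; // sender dropped; treat as shutdown
            }
        }
    }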
Heikki Linnakangas
bad1dd9759 Don't panic if spawning a new WAL receiver thread fails.
The panic would kill the page service thread. That's not too bad, but
let's still try to handle it more gracefully.
2022-01-14 18:02:34 +02:00
Heikki Linnakangas
d29836d0d5 Don't panic if spawning a thread to handle a connection fails.
Log the error and continue. Hopefully it's a transient failure.

This might have been happening in staging earlier, when the safekeeper
had a problem where it opened connections very frequently to issue
"callmemaybe" commands. If you launch too many threads too fast, you might
run out of file descriptors or something. It's not totally clear what
happened, but with this commit, at least the page server will continue to run
and accept new connections if a transient error happens.
2022-01-14 18:02:30 +02:00
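
A minimal sketch of a spawn-tolerant accept loop in that spirit (not the actual page_service code): a failed thread spawn is logged and the loop keeps accepting.

    use std::net::TcpListener;
    use std::thread;
    use tracing::error;

    /// Accept loop that keeps running even if spawning a handler thread fails
    /// (e.g. transient resource exhaustion); only that one connection is dropped.
    fn listener_main(listener: TcpListener) -> anyhow::Result<()> {
        for stream in listener.incoming() {
            let stream = match stream {
                Ok(s) => s,
                Err(e) => {
                    error!("failed to accept connection: {:?}", e);
                    continue;
                }
            };
            let res = thread::Builder::new()
                .name("serving connection".to_string())
                .spawn(move || {
                    // ... handle the connection; `stream` is moved into the thread ...
                    drop(stream);
                });
            if let Err(e) = res {
                // Don't panic: log and go back to accepting new connections.
                error!("failed to spawn connection handler thread: {:?}", e);
            }
        }
        Ok(())
    }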
Heikki Linnakangas
adb0b3dada Include backtrace in error messages in the log.
The 'anyhow' crate can include a backtrace in all errors when the
'backtrace' feature is enabled. Enable it, and change the places that used
'{:#}' or '{}' to '{:?}' so that the backtrace is printed.
2022-01-14 10:10:17 +02:00
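
For illustration, the formatting difference looks roughly like this (enabling the feature means something like anyhow = { version = "1", features = ["backtrace"] } in Cargo.toml; backtrace capture also depends on RUST_BACKTRACE being set):

    use anyhow::{Context, Result};

    fn load_metadata(path: &str) -> Result<Vec<u8>> {
        std::fs::read(path).with_context(|| format!("could not read metadata file {}", path))
    }

    fn main() {
        if let Err(err) = load_metadata("/nonexistent/metadata") {
            // "{}"  -> only the outermost message
            // "{:#}" -> the whole context chain on one line
            // "{:?}" -> the chain plus a captured backtrace, when anyhow's
            //           `backtrace` feature is enabled and RUST_BACKTRACE is set
            eprintln!("{:?}", err);
        }
    }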
bojanserafimov
5e0f39cc9e Add proxy metrics (#1093) 2022-01-13 20:34:30 -05:00
Arthur Petukhovsky
0a34a592d5 Bump vendor/postgres (#1120) 2022-01-13 20:28:37 +03:00
Heikki Linnakangas
19aaa91f6d Timeline IDs are not globally unique, fix some code that assumed that.
A timeline ID is only guaranteed to be unique for a particular tenant,
so you need to use tenant ID + timeline ID as the key, rather than just
timeline ID.

The safekeeper currently makes the same assumption, and we should fix that
too, but this commit just addresses this one case in the page server.

In passing, reorder some function arguments to be more consistent.
2022-01-13 18:45:30 +02:00
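
Schematically, the keying change amounts to this (simplified id types and registry, not the real repository code):

    use std::collections::HashMap;
    use std::sync::{Arc, Mutex};

    // Simplified stand-ins; the real ids are dedicated 16-byte newtypes.
    type ZTenantId = u128;
    type ZTimelineId = u128;
    struct Timeline; // stand-in for the per-timeline state

    struct TimelineRegistry {
        // Was: HashMap<ZTimelineId, Arc<Timeline>>, which silently conflated
        // timelines from different tenants. Key by the pair instead.
        timelines: Mutex<HashMap<(ZTenantId, ZTimelineId), Arc<Timeline>>>,
    }

    impl TimelineRegistry {
        fn get(&self, tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Option<Arc<Timeline>> {
            self.timelines
                .lock()
                .unwrap()
                .get(&(tenant_id, timeline_id))
                .cloned()
        }
    }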
Konstantin Knizhnik
404aab9373 Use mutex to prevent concurrent checkpoints (#1115)
* Use mutex to prevent concurrent checkpoints

* Fix comment
2022-01-13 17:48:24 +03:00
Konstantin Knizhnik
bc6db2c10e Implement IO metrics in VirtualFile (#1112)
* Implement IO metrics in VirtualFile

* Do not group virtual file close statistics by tenantid/timelineid

* Add comments concerning close metrics
2022-01-13 17:36:53 +03:00
Heikki Linnakangas
772d853dcf Fix race condition leading to panic in walkeeper.
The walkeeper launches two threads for each connection and uses a guard
object to remove the entry from the 'replicas' array when it finishes. But
only the background thread held onto the guard object, so if the background
thread finished before the other thread, the array entry would be removed
prematurely, which led to a panic in the check_stop_streaming() call.

Fixes https://github.com/zenithdb/zenith/issues/1103
2022-01-13 11:21:11 +02:00
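
One way to express the fix, sketched with illustrative types: wrap the guard in an Arc and give each thread a clone, so the 'replicas' entry is removed only when the last of the two threads drops its handle.

    use std::sync::{Arc, Mutex};

    /// Shared state: one slot per attached replica.
    struct SharedState {
        replicas: Mutex<Vec<Option<ReplicaState>>>,
    }
    struct ReplicaState; // stand-in

    /// Frees the replica slot when dropped.
    struct ReplicaGuard {
        shared: Arc<SharedState>,
        index: usize,
    }

    impl Drop for ReplicaGuard {
        fn drop(&mut self) {
            self.shared.replicas.lock().unwrap()[self.index] = None;
        }
    }

    /// Hand a clone of the Arc-wrapped guard to each of the two connection
    /// threads: the slot is freed only when the *last* thread finishes, so
    /// neither can observe a prematurely removed entry.
    fn start_connection_threads(shared: Arc<SharedState>, index: usize) {
        let guard = Arc::new(ReplicaGuard { shared, index });

        let g1 = Arc::clone(&guard);
        std::thread::spawn(move || {
            // ... foreground stream handling ...
            drop(g1);
        });

        let g2 = guard;
        std::thread::spawn(move || {
            // ... background feedback handling ...
            drop(g2);
        });
    }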
Arseny Sher
ab4d272149 Add safekeeper --dump-control-file option.
Hex-encode zids there for better output; since Serde doesn't support several
formats for one struct, the on-disk representation is changed as well, and
upgrade.rs is made to cope with it.
2022-01-12 19:47:24 +03:00
Konstantin Knizhnik
f70a5cad61 Fix releasing of timelines lock (#1100)
refer #1087
2022-01-12 15:05:08 +03:00
anastasia
7aba299dbd Use safekeeper in test_branch_behind (#1068)
to avoid a subtle race condition.

Without a safekeeper, walreceiver reconnection can get stuck
because of an IO deadlock between walsender auth and a regular backend.
2022-01-12 14:38:04 +03:00
Kirill Bulatov
4b3b19f444 Support prefixes when working with s3 buckets 2022-01-11 15:44:50 +02:00
Kirill Bulatov
8ab4c8a050 Code review fixes 2022-01-11 15:44:23 +02:00
Kirill Bulatov
7c4a653230 Propagate Zenith CLI's RUST_LOG env var to subprocesses 2022-01-11 15:44:23 +02:00
Kirill Bulatov
a3cd8f0e6d Add the remote storage test 2022-01-11 15:44:23 +02:00
Kirill Bulatov
65c851a451 Test pageserver's timeline http methods
2022-01-11 15:44:23 +02:00
Kirill Bulatov
23cf2fa984 Properly shutdown storage sync loop 2022-01-11 15:44:23 +02:00
Kirill Bulatov
ce8d6ae958 Allow using remote storage in tests 2022-01-11 15:44:23 +02:00
Kirill Bulatov
384b2a91fa Pass generic pageserver params through zenith cli 2022-01-11 15:44:23 +02:00
Arseny Sher
233c4811db Fix default safekeeper http port. 2022-01-11 10:13:27 +03:00
Konstantin Knizhnik
2fd4c390cb Do not hold timelines lock during GC (#1089)
* Do not hold timelines lock during GC
refer #1087

* Add gc_cs mutex for preventing creation of new timelines during GC

* Make clippy happy

* Use Mutex<()> instead of Mutex<i32> for GC critical section
2022-01-10 14:41:15 +03:00
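
A schematic of the final locking shape (simplified types, not the actual repository code): GC and timeline creation serialize on a dedicated gc_cs mutex, and since it guards no data of its own, Mutex<()> states that intent directly.

    use std::collections::HashMap;
    use std::sync::Mutex;

    struct Repository {
        /// Guards "GC vs. new-timeline creation"; it protects no data of its own,
        /// so the unit type makes that explicit (hence Mutex<()> rather than Mutex<i32>).
        gc_cs: Mutex<()>,
        timelines: Mutex<HashMap<u128, ()>>, // simplified timeline map
    }

    impl Repository {
        fn gc_iteration(&self) {
            // Hold gc_cs for the whole GC pass, but take the timelines lock only
            // briefly to snapshot the set of timelines, not across file deletion.
            let _gc_guard = self.gc_cs.lock().unwrap();
            let timeline_ids: Vec<u128> = self.timelines.lock().unwrap().keys().copied().collect();
            for _id in timeline_ids {
                // ... delete obsolete layer files without holding `timelines` ...
            }
        }

        fn create_timeline(&self, id: u128) {
            // Branch creation also takes gc_cs, so it cannot race with a GC pass.
            let _gc_guard = self.gc_cs.lock().unwrap();
            self.timelines.lock().unwrap().insert(id, ());
        }
    }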
117 changed files with 6995 additions and 3860 deletions

View File

@@ -1,28 +1,28 @@
version: 2.1 version: 2.1
executors: executors:
zenith-build-executor: zenith-xlarge-executor:
resource_class: xlarge resource_class: xlarge
docker: docker:
- image: cimg/rust:1.56.1 # NB: when changed, do not forget to update rust image tag in all Dockerfiles
zenith-python-executor: - image: zimg/rust:1.56
zenith-executor:
docker: docker:
- image: cimg/python:3.7.10 # Oldest available 3.7 with Ubuntu 20.04 (for GLIBC and Rust) at CirlceCI - image: zimg/rust:1.56
jobs: jobs:
check-codestyle-rust: check-codestyle-rust:
executor: zenith-build-executor executor: zenith-xlarge-executor
steps: steps:
- checkout - checkout
- run: - run:
name: rustfmt name: rustfmt
when: always when: always
command: | command: cargo fmt --all -- --check
cargo fmt --all -- --check
# A job to build postgres # A job to build postgres
build-postgres: build-postgres:
executor: zenith-build-executor executor: zenith-xlarge-executor
parameters: parameters:
build_type: build_type:
type: enum type: enum
@@ -37,8 +37,7 @@ jobs:
# Note this works even though the submodule hasn't been checkout out yet. # Note this works even though the submodule hasn't been checkout out yet.
- run: - run:
name: Get postgres cache key name: Get postgres cache key
command: | command: git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
- restore_cache: - restore_cache:
name: Restore postgres cache name: Restore postgres cache
@@ -46,15 +45,6 @@ jobs:
# Restore ONLY if the rev key matches exactly # Restore ONLY if the rev key matches exactly
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
# FIXME We could cache our own docker container, instead of installing packages every time.
- run:
name: apt install dependencies
command: |
if [ ! -e tmp_install/bin/postgres ]; then
sudo apt update
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
fi
# Build postgres if the restore_cache didn't find a build. # Build postgres if the restore_cache didn't find a build.
# `make` can't figure out whether the cache is valid, since # `make` can't figure out whether the cache is valid, since
# it only compares file timestamps. # it only compares file timestamps.
@@ -64,7 +54,7 @@ jobs:
if [ ! -e tmp_install/bin/postgres ]; then if [ ! -e tmp_install/bin/postgres ]; then
# "depth 1" saves some time by not cloning the whole repo # "depth 1" saves some time by not cloning the whole repo
git submodule update --init --depth 1 git submodule update --init --depth 1
make postgres -j8 mold -run make postgres -j$(nproc)
fi fi
- save_cache: - save_cache:
@@ -75,7 +65,7 @@ jobs:
# A job to build zenith rust code # A job to build zenith rust code
build-zenith: build-zenith:
executor: zenith-build-executor executor: zenith-xlarge-executor
parameters: parameters:
build_type: build_type:
type: enum type: enum
@@ -83,12 +73,6 @@ jobs:
environment: environment:
BUILD_TYPE: << parameters.build_type >> BUILD_TYPE: << parameters.build_type >>
steps: steps:
- run:
name: apt install dependencies
command: |
sudo apt update
sudo apt install libssl-dev clang
# Checkout the git repo (without submodules) # Checkout the git repo (without submodules)
- checkout - checkout
@@ -126,7 +110,7 @@ jobs:
fi fi
export CARGO_INCREMENTAL=0 export CARGO_INCREMENTAL=0
"${cov_prefix[@]}" cargo build $CARGO_FLAGS --bins --tests "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests
- save_cache: - save_cache:
name: Save rust cache name: Save rust cache
@@ -210,6 +194,14 @@ jobs:
command: | command: |
cp -a tmp_install /tmp/zenith/pg_install cp -a tmp_install /tmp/zenith/pg_install
- run:
name: Merge coverage data
command: |
# This will speed up workspace uploads
if [[ $BUILD_TYPE == "debug" ]]; then
scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
fi
# Save the rust binaries and coverage data for other jobs in this workflow. # Save the rust binaries and coverage data for other jobs in this workflow.
- persist_to_workspace: - persist_to_workspace:
root: /tmp/zenith root: /tmp/zenith
@@ -217,23 +209,30 @@ jobs:
- "*" - "*"
check-codestyle-python: check-codestyle-python:
executor: zenith-python-executor executor: zenith-executor
steps: steps:
- checkout - checkout
- restore_cache:
keys:
- v1-python-deps-{{ checksum "poetry.lock" }}
- run: - run:
name: Install deps name: Install deps
command: pipenv --python 3.7 install --dev command: ./scripts/pysync
- save_cache:
key: v1-python-deps-{{ checksum "poetry.lock" }}
paths:
- /home/circleci/.cache/pypoetry/virtualenvs
- run: - run:
name: Run yapf to ensure code format name: Run yapf to ensure code format
when: always when: always
command: pipenv run yapf --recursive --diff . command: poetry run yapf --recursive --diff .
- run: - run:
name: Run mypy to check types name: Run mypy to check types
when: always when: always
command: pipenv run mypy . command: poetry run mypy .
run-pytest: run-pytest:
executor: zenith-python-executor executor: zenith-executor
parameters: parameters:
# pytest args to specify the tests to run. # pytest args to specify the tests to run.
# #
@@ -272,9 +271,16 @@ jobs:
condition: << parameters.needs_postgres_source >> condition: << parameters.needs_postgres_source >>
steps: steps:
- run: git submodule update --init --depth 1 - run: git submodule update --init --depth 1
- restore_cache:
keys:
- v1-python-deps-{{ checksum "poetry.lock" }}
- run: - run:
name: Install deps name: Install deps
command: pipenv --python 3.7 install command: ./scripts/pysync
- save_cache:
key: v1-python-deps-{{ checksum "poetry.lock" }}
paths:
- /home/circleci/.cache/pypoetry/virtualenvs
- run: - run:
name: Run pytest name: Run pytest
# pytest doesn't output test logs in real time, so CI job may fail with # pytest doesn't output test logs in real time, so CI job may fail with
@@ -326,7 +332,7 @@ jobs:
# -n4 uses four processes to run tests via pytest-xdist # -n4 uses four processes to run tests via pytest-xdist
# -s is not used to prevent pytest from capturing output, because tests are running # -s is not used to prevent pytest from capturing output, because tests are running
# in parallel and logs are mixed between different tests # in parallel and logs are mixed between different tests
"${cov_prefix[@]}" pipenv run pytest \ "${cov_prefix[@]}" ./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \ --junitxml=$TEST_OUTPUT/junit.xml \
--tb=short \ --tb=short \
--verbose \ --verbose \
@@ -356,6 +362,13 @@ jobs:
# The store_test_results step tells CircleCI where to find the junit.xml file. # The store_test_results step tells CircleCI where to find the junit.xml file.
- store_test_results: - store_test_results:
path: /tmp/test_output path: /tmp/test_output
- run:
name: Merge coverage data
command: |
# This will speed up workspace uploads
if [[ $BUILD_TYPE == "debug" ]]; then
scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
fi
# Save coverage data (if any) # Save coverage data (if any)
- persist_to_workspace: - persist_to_workspace:
root: /tmp/zenith root: /tmp/zenith
@@ -363,7 +376,7 @@ jobs:
- "*" - "*"
coverage-report: coverage-report:
executor: zenith-build-executor executor: zenith-xlarge-executor
steps: steps:
- attach_workspace: - attach_workspace:
at: /tmp/zenith at: /tmp/zenith
@@ -375,12 +388,6 @@ jobs:
# there's no way to clean out old packages, so the cache grows every time something # there's no way to clean out old packages, so the cache grows every time something
# changes. # changes.
- v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }} - v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
- run:
name: Install llvm-tools
command: |
# TODO: install a proper symbol demangler, e.g. rustfilt
# TODO: we should embed this into a docker image
rustup component add llvm-tools-preview
- run: - run:
name: Build coverage report name: Build coverage report
command: | command: |
@@ -443,27 +450,25 @@ jobs:
- checkout - checkout
- setup_remote_docker: - setup_remote_docker:
docker_layer_caching: true docker_layer_caching: true
- run:
name: Login to docker hub
command: echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
- run:
name: Setup buildx
command: docker run -it --rm --privileged tonistiigi/binfmt --install all
# Build zenithdb/compute-tools:latest image and push it to Docker hub # Build zenithdb/compute-tools:latest image and push it to Docker hub
# TODO: this should probably also use versioned tag, not just :latest. # TODO: this should probably also use versioned tag, not just :latest.
# XXX: but should it? We build and use it only locally now. # XXX: but should it? We build and use it only locally now.
- run: - run:
name: Build and push compute-tools Docker image name: Build and push compute-tools Docker image
command: docker buildx build --platform linux/amd64,linux/arm64 --push -t zenithdb/compute-tools:latest compute_tools command: |
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
docker build -t zenithdb/compute-tools:latest -f Dockerfile.compute-tools .
docker push zenithdb/compute-tools:latest
- run: - run:
name: Init postgres submodule name: Init postgres submodule
command: git submodule update --init --depth 1 command: git submodule update --init --depth 1
- run: - run:
name: Build and push compute-node Docker image name: Build and push compute-node Docker image
command: | command: |
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
DOCKER_TAG=$(git log --oneline|wc -l) DOCKER_TAG=$(git log --oneline|wc -l)
docker buildx build --platform linux/amd64,linux/arm64 --push -t zenithdb/compute-node:latest vendor/postgres docker build -t zenithdb/compute-node:latest vendor/postgres && docker push zenithdb/compute-node:latest
docker buildx build --platform linux/amd64,linux/arm64 --push -t zenithdb/compute-node:${DOCKER_TAG} vendor/postgres docker tag zenithdb/compute-node:latest zenithdb/compute-node:${DOCKER_TAG} && docker push zenithdb/compute-node:${DOCKER_TAG}
deploy-staging: deploy-staging:
docker: docker:
@@ -573,55 +578,6 @@ jobs:
} }
}" }"
#
#
# compute-tools jobs
# TODO: unify with main build_and_test pipeline
#
#
compute-tools-test:
executor: zenith-build-executor
working_directory: ~/repo/compute_tools
steps:
- checkout:
path: ~/repo
- restore_cache:
name: Restore rust cache
keys:
# Require an exact match. While an out of date cache might speed up the build,
# there's no way to clean out old packages, so the cache grows every time something
# changes.
- v03-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
# Build the rust code, including test binaries
- run:
name: Rust build
environment:
CARGO_INCREMENTAL: 0
command: cargo build --bins --tests
- save_cache:
name: Save rust cache
key: v03-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
paths:
- ~/.cargo/registry
- ~/.cargo/git
- target
# Run Rust formatting checks
- run:
name: cargo fmt check
command: cargo fmt --all -- --check
# Run Rust linter (clippy)
- run:
name: cargo clippy check
command: cargo clippy --all --all-targets -- -Dwarnings -Drust-2018-idioms
# Run Rust integration and unittests
- run: cargo test
workflows: workflows:
build_and_test: build_and_test:
jobs: jobs:
@@ -670,7 +626,6 @@ workflows:
requires: requires:
# TODO: consider adding more # TODO: consider adding more
- other-tests-debug - other-tests-debug
- compute-tools-test
- docker-image: - docker-image:
# Context gives an ability to login # Context gives an ability to login
context: Docker Hub context: Docker Hub
@@ -690,11 +645,9 @@ workflows:
branches: branches:
only: only:
- main - main
- docker-multi-platform requires:
# requires: - pg_regress-tests-release
# - pg_regress-tests-release - other-tests-release
# - other-tests-release
# - compute-tools-test
- deploy-staging: - deploy-staging:
# Context gives an ability to login # Context gives an ability to login
context: Docker Hub context: Docker Hub

View File

@@ -5,9 +5,23 @@ settings:
authEndpoint: "https://console.stage.zenith.tech/authenticate_proxy_request/" authEndpoint: "https://console.stage.zenith.tech/authenticate_proxy_request/"
uri: "https://console.stage.zenith.tech/psql_session/" uri: "https://console.stage.zenith.tech/psql_session/"
# -- Additional labels for zenith-proxy pods
podLabels:
zenith_service: proxy
zenith_env: staging
zenith_region: us-east-1
zenith_region_slug: virginia
exposedService: exposedService:
annotations: annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: start.stage.zenith.tech external-dns.alpha.kubernetes.io/hostname: start.stage.zenith.tech
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack

View File

@@ -36,20 +36,20 @@ jobs:
# see https://github.com/actions/setup-python/issues/162 # see https://github.com/actions/setup-python/issues/162
# and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular # and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular
# so the simplest solution to me is to use already installed system python and spin virtualenvs for job runs. # so the simplest solution to me is to use already installed system python and spin virtualenvs for job runs.
# there is Python 3.7.10 already installed on the machine so use it to install pipenv and then use pipenv's virtuealenvs # there is Python 3.7.10 already installed on the machine so use it to install poetry and then use poetry's virtuealenvs
- name: Install pipenv & deps - name: Install poetry & deps
run: | run: |
python3 -m pip install --upgrade pipenv wheel python3 -m pip install --upgrade poetry wheel
# since pip/pipenv caches are reused there shouldn't be any troubles with install every time # since pip/poetry caches are reused there shouldn't be any troubles with install every time
pipenv install poetry install
- name: Show versions - name: Show versions
run: | run: |
echo Python echo Python
python3 --version python3 --version
pipenv run python3 --version poetry run python3 --version
echo Pipenv echo Pipenv
pipenv --version poetry --version
echo Pgbench echo Pgbench
$PG_BIN/pgbench --version $PG_BIN/pgbench --version
@@ -90,7 +90,7 @@ jobs:
REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
run: | run: |
mkdir -p perf-report-staging mkdir -p perf-report-staging
pipenv run pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging
- name: Submit result - name: Submit result
env: env:

View File

@@ -1,44 +0,0 @@
## Build docker image zenithdb/build:buster for linux/adm64 and linux/arm64 platforms
name: docker-builder
on:
push:
branches:
- 'docker-multi-platform'
schedule:
# * is a special character in YAML so you have to quote this string
# buil daily at 5:30am
- cron: '30 5 * * *'
jobs:
docker-builder-buster:
runs-on: ubuntu-latest
steps:
-
name: Checkout
uses: actions/checkout@v2
with:
submodules: false
-
name: Set up QEMU
uses: docker/setup-qemu-action@v1
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
-
name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push zenithdb/build:buster
uses: docker/build-push-action@v2
with:
push: true
file: Dockerfile.build
platforms: linux/amd64,linux/arm64
cache-from: type=registry,ref=zenithdb/build:buster
tags: zenithdb/build:buster

261
Cargo.lock generated
View File

@@ -2,12 +2,38 @@
# It is not intended for manual editing. # It is not intended for manual editing.
version = 3 version = 3
[[package]]
name = "addr2line"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9ecd88a8c8378ca913a680cd98f0f13ac67383d35993f86c90a70e3f137816b"
dependencies = [
"gimli",
]
[[package]]
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]] [[package]]
name = "ahash" name = "ahash"
version = "0.4.7" version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e" checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e"
[[package]]
name = "ahash"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
dependencies = [
"getrandom",
"once_cell",
"version_check",
]
[[package]] [[package]]
name = "aho-corasick" name = "aho-corasick"
version = "0.7.18" version = "0.7.18"
@@ -40,6 +66,9 @@ name = "anyhow"
version = "1.0.44" version = "1.0.44"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61604a8f862e1d5c3229fdd78f8b02c68dcf73a4c4b05fd636d12240aaa242c1" checksum = "61604a8f862e1d5c3229fdd78f8b02c68dcf73a4c4b05fd636d12240aaa242c1"
dependencies = [
"backtrace",
]
[[package]] [[package]]
name = "async-compression" name = "async-compression"
@@ -57,9 +86,9 @@ dependencies = [
[[package]] [[package]]
name = "async-trait" name = "async-trait"
version = "0.1.51" version = "0.1.52"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44318e776df68115a881de9a8fd1b9e53368d7a4a5ce4cc48517da3393233a5e" checksum = "061a7acccaa286c011ddc30970520b98fa40e00c9d644633fb26b5fc63a265e3"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@@ -149,6 +178,21 @@ dependencies = [
"anyhow", "anyhow",
] ]
[[package]]
name = "backtrace"
version = "0.3.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "321629d8ba6513061f26707241fa9bc89524ff1cd7a915a97ef0c62c666ce1b6"
dependencies = [
"addr2line",
"cc",
"cfg-if",
"libc",
"miniz_oxide",
"object",
"rustc-demangle",
]
[[package]] [[package]]
name = "base64" name = "base64"
version = "0.12.3" version = "0.12.3"
@@ -331,6 +375,25 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "compute_tools"
version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"clap",
"env_logger",
"hyper",
"libc",
"log",
"postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
"regex",
"serde",
"serde_json",
"tar",
"tokio",
]
[[package]] [[package]]
name = "const_format" name = "const_format"
version = "0.2.22" version = "0.2.22"
@@ -359,7 +422,7 @@ dependencies = [
"lazy_static", "lazy_static",
"nix", "nix",
"pageserver", "pageserver",
"postgres", "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"regex", "regex",
"reqwest", "reqwest",
"serde", "serde",
@@ -655,6 +718,12 @@ dependencies = [
"wasi", "wasi",
] ]
[[package]]
name = "gimli"
version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78cc372d058dcf6d5ecd98510e7fbc9e5aec4d21de70f65fea8fecebcd881bd4"
[[package]] [[package]]
name = "git-version" name = "git-version"
version = "0.3.5" version = "0.3.5"
@@ -714,7 +783,7 @@ version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04"
dependencies = [ dependencies = [
"ahash", "ahash 0.4.7",
] ]
[[package]] [[package]]
@@ -722,6 +791,9 @@ name = "hashbrown"
version = "0.11.2" version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
dependencies = [
"ahash 0.7.6",
]
[[package]] [[package]]
name = "hermit-abi" name = "hermit-abi"
@@ -922,7 +994,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "afabcc15e437a6484fc4f12d0fd63068fe457bf93f1c148d3d9649c60b103f32" checksum = "afabcc15e437a6484fc4f12d0fd63068fe457bf93f1c148d3d9649c60b103f32"
dependencies = [ dependencies = [
"base64 0.12.3", "base64 0.12.3",
"pem", "pem 0.8.3",
"ring", "ring",
"serde", "serde",
"serde_json", "serde_json",
@@ -982,6 +1054,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"serde",
] ]
[[package]] [[package]]
@@ -1048,6 +1121,16 @@ version = "0.3.16"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
[[package]]
name = "miniz_oxide"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b"
dependencies = [
"adler",
"autocfg",
]
[[package]] [[package]]
name = "mio" name = "mio"
version = "0.7.13" version = "0.7.13"
@@ -1144,6 +1227,15 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "object"
version = "0.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67ac1d3f9a1d3616fd9a60c8d74296f22406a238b6a72f5cc1e6f314df4ffbf9"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "once_cell" name = "once_cell"
version = "1.8.0" version = "1.8.0"
@@ -1192,9 +1284,9 @@ dependencies = [
"nix", "nix",
"once_cell", "once_cell",
"parking_lot", "parking_lot",
"postgres", "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"postgres-protocol", "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"postgres-types", "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"postgres_ffi", "postgres_ffi",
"rand", "rand",
"regex", "regex",
@@ -1208,7 +1300,7 @@ dependencies = [
"tempfile", "tempfile",
"thiserror", "thiserror",
"tokio", "tokio",
"tokio-postgres", "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"tokio-stream", "tokio-stream",
"toml_edit", "toml_edit",
"tracing", "tracing",
@@ -1261,6 +1353,15 @@ dependencies = [
"regex", "regex",
] ]
[[package]]
name = "pem"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9a3b09a20e374558580a4914d3b7d89bd61b954a5a5e1dcbea98753addb1947"
dependencies = [
"base64 0.13.0",
]
[[package]] [[package]]
name = "percent-encoding" name = "percent-encoding"
version = "2.1.0" version = "2.1.0"
@@ -1317,6 +1418,20 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "postgres"
version = "0.19.1"
source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7"
dependencies = [
"bytes",
"fallible-iterator",
"futures",
"log",
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"tokio",
"tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
]
[[package]] [[package]]
name = "postgres" name = "postgres"
version = "0.19.1" version = "0.19.1"
@@ -1326,9 +1441,27 @@ dependencies = [
"fallible-iterator", "fallible-iterator",
"futures", "futures",
"log", "log",
"postgres-protocol", "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
"tokio", "tokio",
"tokio-postgres", "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
]
[[package]]
name = "postgres-protocol"
version = "0.6.1"
source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7"
dependencies = [
"base64 0.13.0",
"byteorder",
"bytes",
"fallible-iterator",
"hmac 0.10.1",
"lazy_static",
"md-5",
"memchr",
"rand",
"sha2",
"stringprep",
] ]
[[package]] [[package]]
@@ -1349,6 +1482,16 @@ dependencies = [
"stringprep", "stringprep",
] ]
[[package]]
name = "postgres-types"
version = "0.2.1"
source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7"
dependencies = [
"bytes",
"fallible-iterator",
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
]
[[package]] [[package]]
name = "postgres-types" name = "postgres-types"
version = "0.2.1" version = "0.2.1"
@@ -1356,7 +1499,7 @@ source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b
dependencies = [ dependencies = [
"bytes", "bytes",
"fallible-iterator", "fallible-iterator",
"postgres-protocol", "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
] ]
[[package]] [[package]]
@@ -1421,19 +1564,31 @@ name = "proxy"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-trait",
"base64 0.13.0",
"bytes", "bytes",
"clap", "clap",
"futures",
"hashbrown 0.11.2",
"hex", "hex",
"hyper",
"lazy_static", "lazy_static",
"md5", "md5",
"parking_lot", "parking_lot",
"pin-project-lite",
"rand", "rand",
"rcgen",
"reqwest", "reqwest",
"routerify",
"rustls 0.19.1", "rustls 0.19.1",
"scopeguard",
"serde", "serde",
"serde_json", "serde_json",
"tokio", "tokio",
"tokio-postgres", "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"tokio-postgres-rustls",
"tokio-rustls",
"zenith_metrics",
"zenith_utils", "zenith_utils",
] ]
@@ -1492,6 +1647,18 @@ dependencies = [
"rand_core", "rand_core",
] ]
[[package]]
name = "rcgen"
version = "0.8.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5911d1403f4143c9d56a702069d593e8d0f3fab880a85e103604d0893ea31ba7"
dependencies = [
"chrono",
"pem 1.0.2",
"ring",
"yasna",
]
[[package]] [[package]]
name = "redox_syscall" name = "redox_syscall"
version = "0.2.10" version = "0.2.10"
@@ -1650,6 +1817,12 @@ dependencies = [
"url", "url",
] ]
[[package]]
name = "rustc-demangle"
version = "0.1.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342"
[[package]] [[package]]
name = "rustc-hash" name = "rustc-hash"
version = "1.1.0" version = "1.1.0"
@@ -2079,6 +2252,28 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "tokio-postgres"
version = "0.7.1"
source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7"
dependencies = [
"async-trait",
"byteorder",
"bytes",
"fallible-iterator",
"futures",
"log",
"parking_lot",
"percent-encoding",
"phf",
"pin-project-lite",
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"socket2",
"tokio",
"tokio-util",
]
[[package]] [[package]]
name = "tokio-postgres" name = "tokio-postgres"
version = "0.7.1" version = "0.7.1"
@@ -2094,13 +2289,28 @@ dependencies = [
"percent-encoding", "percent-encoding",
"phf", "phf",
"pin-project-lite", "pin-project-lite",
"postgres-protocol", "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
"postgres-types", "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
"socket2", "socket2",
"tokio", "tokio",
"tokio-util", "tokio-util",
] ]
[[package]]
name = "tokio-postgres-rustls"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7bd8c37d8c23cb6ecdc32fc171bade4e9c7f1be65f693a17afbaad02091a0a19"
dependencies = [
"futures",
"ring",
"rustls 0.19.1",
"tokio",
"tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"tokio-rustls",
"webpki 0.21.4",
]
[[package]] [[package]]
name = "tokio-rustls" name = "tokio-rustls"
version = "0.22.0" version = "0.22.0"
@@ -2332,8 +2542,8 @@ dependencies = [
"humantime", "humantime",
"hyper", "hyper",
"lazy_static", "lazy_static",
"postgres", "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"postgres-protocol", "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"postgres_ffi", "postgres_ffi",
"regex", "regex",
"routerify", "routerify",
@@ -2343,7 +2553,7 @@ dependencies = [
"signal-hook", "signal-hook",
"tempfile", "tempfile",
"tokio", "tokio",
"tokio-postgres", "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"tracing", "tracing",
"walkdir", "walkdir",
"workspace_hack", "workspace_hack",
@@ -2573,6 +2783,15 @@ version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3" checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3"
[[package]]
name = "yasna"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e262a29d0e61ccf2b6190d7050d4b237535fc76ce4c1210d9caa316f71dffa75"
dependencies = [
"chrono",
]
[[package]] [[package]]
name = "zenith" name = "zenith"
version = "0.1.0" version = "0.1.0"
@@ -2581,7 +2800,7 @@ dependencies = [
"clap", "clap",
"control_plane", "control_plane",
"pageserver", "pageserver",
"postgres", "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"postgres_ffi", "postgres_ffi",
"serde_json", "serde_json",
"walkeeper", "walkeeper",
@@ -2614,7 +2833,9 @@ dependencies = [
"jsonwebtoken", "jsonwebtoken",
"lazy_static", "lazy_static",
"nix", "nix",
"postgres", "pin-project-lite",
"postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)",
"rand", "rand",
"routerify", "routerify",
"rustls 0.19.1", "rustls 0.19.1",

View File

@@ -1,5 +1,6 @@
[workspace] [workspace]
members = [ members = [
"compute_tools",
"control_plane", "control_plane",
"pageserver", "pageserver",
"postgres_ffi", "postgres_ffi",
@@ -15,3 +16,8 @@ members = [
# This is useful for profiling and, to some extent, debug. # This is useful for profiling and, to some extent, debug.
# Besides, debug info should not affect the performance. # Besides, debug info should not affect the performance.
debug = true debug = true
# This is only needed for proxy's tests
# TODO: we should probably fork tokio-postgres-rustls instead
[patch.crates-io]
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }

View File

@@ -2,8 +2,9 @@
# Image with all the required dependencies to build https://github.com/zenithdb/zenith # Image with all the required dependencies to build https://github.com/zenithdb/zenith
# and Postgres from https://github.com/zenithdb/postgres # and Postgres from https://github.com/zenithdb/postgres
# Also includes some rust development and build tools. # Also includes some rust development and build tools.
# NB: keep in sync with rust image version in .circle/config.yml
# #
FROM rust:slim-buster FROM rust:1.56.1-slim-buster
WORKDIR /zenith WORKDIR /zenith
# Install postgres and zenith build dependencies # Install postgres and zenith build dependencies

14
Dockerfile.compute-tools Normal file
View File

@@ -0,0 +1,14 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .circle/config.yml
FROM rust:1.56.1-slim-buster AS rust-build
WORKDIR /zenith
COPY . .
RUN cargo build -p compute_tools --release
# Final image that only has one binary
FROM debian:buster-slim
COPY --from=rust-build /zenith/target/release/zenith_ctl /usr/local/bin/zenith_ctl

30
Pipfile
View File

@@ -1,30 +0,0 @@
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"
[packages]
pytest = ">=6.0.0"
typing-extensions = "*"
pyjwt = {extras = ["crypto"], version = "*"}
requests = "*"
pytest-xdist = "*"
asyncpg = "*"
cached-property = "*"
psycopg2-binary = "*"
jinja2 = "*"
[dev-packages]
# Behavior may change slightly between versions. These are run continuously,
# so we pin exact versions to avoid suprising breaks. Update if comfortable.
yapf = "==0.31.0"
mypy = "==0.910"
# Non-pinned packages follow.
pipenv = "*"
flake8 = "*"
types-requests = "*"
types-psycopg2 = "*"
[requires]
# we need at least 3.7, but pipenv doesn't allow to say this directly
python_version = "3"

652
Pipfile.lock generated
View File

@@ -1,652 +0,0 @@
{
"_meta": {
"hash": {
"sha256": "c309cb963a7b07ae3d30e9cbf08b495f77bdecc0e5356fc89d133c4fbcb65b2b"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.python.org/simple",
"verify_ssl": true
}
]
},
"default": {
"asyncpg": {
"hashes": [
"sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317",
"sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20",
"sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18",
"sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03",
"sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d",
"sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c",
"sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843",
"sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853",
"sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1",
"sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6",
"sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83",
"sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5",
"sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"
],
"index": "pypi",
"version": "==0.24.0"
},
"attrs": {
"hashes": [
"sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
"sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.2.0"
},
"cached-property": {
"hashes": [
"sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130",
"sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"
],
"index": "pypi",
"version": "==1.5.2"
},
"certifi": {
"hashes": [
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
"version": "==2021.10.8"
},
"cffi": {
"hashes": [
"sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
"sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
"sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
"sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
"sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
"sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
"sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
"sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
"sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
"sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
"sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
"sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
"sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
"sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
"sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
"sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
"sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
"sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
"sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
"sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
"sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
"sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
"sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
"sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
"sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
"sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
"sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
"sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
"sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
"sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
"sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
"sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
"sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
"sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
"sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
"sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
"sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
"sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
"sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
"sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
"sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
"sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
"sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
"sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
"sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
"sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
"sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
"sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
"sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
"sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
],
"version": "==1.15.0"
},
"charset-normalizer": {
"hashes": [
"sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
"sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
],
"markers": "python_version >= '3'",
"version": "==2.0.7"
},
"cryptography": {
"hashes": [
"sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
"sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
"sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
"sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
"sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
"sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
"sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
"sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
"sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
"sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
"sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
"sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
"sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
"sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
"sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
"sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
"sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
"sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
"sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
"sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
],
"version": "==35.0.0"
},
"execnet": {
"hashes": [
"sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5",
"sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.9.0"
},
"idna": {
"hashes": [
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
],
"markers": "python_version >= '3'",
"version": "==3.3"
},
"importlib-metadata": {
"hashes": [
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
],
"markers": "python_version < '3.8'",
"version": "==4.8.1"
},
"iniconfig": {
"hashes": [
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
"sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
],
"version": "==1.1.1"
},
"jinja2": {
"hashes": [
"sha256:827a0e32839ab1600d4eb1c4c33ec5a8edfbc5cb42dafa13b81f182f97784b45",
"sha256:8569982d3f0889eed11dd620c706d39b60c36d6d25843961f33f77fb6bc6b20c"
],
"index": "pypi",
"version": "==3.0.2"
},
"markupsafe": {
"hashes": [
"sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298",
"sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64",
"sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b",
"sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194",
"sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567",
"sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff",
"sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724",
"sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74",
"sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646",
"sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35",
"sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6",
"sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a",
"sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6",
"sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad",
"sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26",
"sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38",
"sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac",
"sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7",
"sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6",
"sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047",
"sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75",
"sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f",
"sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b",
"sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135",
"sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8",
"sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a",
"sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a",
"sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1",
"sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9",
"sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864",
"sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914",
"sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee",
"sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f",
"sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18",
"sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8",
"sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2",
"sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d",
"sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b",
"sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b",
"sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86",
"sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6",
"sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f",
"sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb",
"sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833",
"sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28",
"sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e",
"sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415",
"sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902",
"sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f",
"sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d",
"sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9",
"sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d",
"sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145",
"sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066",
"sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c",
"sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1",
"sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a",
"sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207",
"sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f",
"sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53",
"sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd",
"sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134",
"sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85",
"sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9",
"sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5",
"sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94",
"sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509",
"sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51",
"sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872"
],
"markers": "python_version >= '3.6'",
"version": "==2.0.1"
},
"packaging": {
"hashes": [
"sha256:096d689d78ca690e4cd8a89568ba06d07ca097e3306a4381635073ca91479966",
"sha256:14317396d1e8cdb122989b916fa2c7e9ca8e2be9e8060a6eff75b6b7b4d8a7e0"
],
"markers": "python_version >= '3.6'",
"version": "==21.2"
},
"pluggy": {
"hashes": [
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
],
"markers": "python_version >= '3.6'",
"version": "==1.0.0"
},
"psycopg2-binary": {
"hashes": [
"sha256:0b7dae87f0b729922e06f85f667de7bf16455d411971b2043bbd9577af9d1975",
"sha256:0f2e04bd2a2ab54fa44ee67fe2d002bb90cee1c0f1cc0ebc3148af7b02034cbd",
"sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616",
"sha256:1473c0215b0613dd938db54a653f68251a45a78b05f6fc21af4326f40e8360a2",
"sha256:14db1752acdd2187d99cb2ca0a1a6dfe57fc65c3281e0f20e597aac8d2a5bd90",
"sha256:1e3a362790edc0a365385b1ac4cc0acc429a0c0d662d829a50b6ce743ae61b5a",
"sha256:1e85b74cbbb3056e3656f1cc4781294df03383127a8114cbc6531e8b8367bf1e",
"sha256:20f1ab44d8c352074e2d7ca67dc00843067788791be373e67a0911998787ce7d",
"sha256:24b0b6688b9f31a911f2361fe818492650795c9e5d3a1bc647acbd7440142a4f",
"sha256:2f62c207d1740b0bde5c4e949f857b044818f734a3d57f1d0d0edc65050532ed",
"sha256:3242b9619de955ab44581a03a64bdd7d5e470cc4183e8fcadd85ab9d3756ce7a",
"sha256:35c4310f8febe41f442d3c65066ca93cccefd75013df3d8c736c5b93ec288140",
"sha256:4235f9d5ddcab0b8dbd723dca56ea2922b485ea00e1dafacf33b0c7e840b3d32",
"sha256:542875f62bc56e91c6eac05a0deadeae20e1730be4c6334d8f04c944fcd99759",
"sha256:5ced67f1e34e1a450cdb48eb53ca73b60aa0af21c46b9b35ac3e581cf9f00e31",
"sha256:661509f51531ec125e52357a489ea3806640d0ca37d9dada461ffc69ee1e7b6e",
"sha256:7360647ea04db2e7dff1648d1da825c8cf68dc5fbd80b8fb5b3ee9f068dcd21a",
"sha256:736b8797b58febabb85494142c627bd182b50d2a7ec65322983e71065ad3034c",
"sha256:8c13d72ed6af7fd2c8acbd95661cf9477f94e381fce0792c04981a8283b52917",
"sha256:988b47ac70d204aed01589ed342303da7c4d84b56c2f4c4b8b00deda123372bf",
"sha256:995fc41ebda5a7a663a254a1dcac52638c3e847f48307b5416ee373da15075d7",
"sha256:a36c7eb6152ba5467fb264d73844877be8b0847874d4822b7cf2d3c0cb8cdcb0",
"sha256:aed4a9a7e3221b3e252c39d0bf794c438dc5453bc2963e8befe9d4cd324dff72",
"sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698",
"sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773",
"sha256:b4d7679a08fea64573c969f6994a2631908bb2c0e69a7235648642f3d2e39a68",
"sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76",
"sha256:ca86db5b561b894f9e5f115d6a159fff2a2570a652e07889d8a383b5fae66eb4",
"sha256:cfc523edecddaef56f6740d7de1ce24a2fdf94fd5e704091856a201872e37f9f",
"sha256:d92272c7c16e105788efe2cfa5d680f07e34e0c29b03c1908f8636f55d5f915a",
"sha256:da113b70f6ec40e7d81b43d1b139b9db6a05727ab8be1ee559f3a69854a69d34",
"sha256:f6fac64a38f6768e7bc7b035b9e10d8a538a9fadce06b983fb3e6fa55ac5f5ce",
"sha256:f8559617b1fcf59a9aedba2c9838b5b6aa211ffedecabca412b92a1ff75aac1a",
"sha256:fbb42a541b1093385a2d8c7eec94d26d30437d0e77c1d25dae1dcc46741a385e"
],
"index": "pypi",
"version": "==2.9.1"
},
"py": {
"hashes": [
"sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
"sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.10.0"
},
"pycparser": {
"hashes": [
"sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
"sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.20"
},
"pyjwt": {
"extras": [
"crypto"
],
"hashes": [
"sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41",
"sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"
],
"index": "pypi",
"version": "==2.3.0"
},
"pyparsing": {
"hashes": [
"sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
"sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.4.7"
},
"pytest": {
"hashes": [
"sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
"sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
],
"index": "pypi",
"version": "==6.2.5"
},
"pytest-forked": {
"hashes": [
"sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
"sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.3.0"
},
"pytest-xdist": {
"hashes": [
"sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168",
"sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9"
],
"index": "pypi",
"version": "==2.4.0"
},
"requests": {
"hashes": [
"sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
"sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
],
"index": "pypi",
"version": "==2.26.0"
},
"toml": {
"hashes": [
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.10.2"
},
"typing-extensions": {
"hashes": [
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
],
"index": "pypi",
"version": "==3.10.0.2"
},
"urllib3": {
"hashes": [
"sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
"sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.7"
},
"zipp": {
"hashes": [
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
],
"markers": "python_version >= '3.6'",
"version": "==3.6.0"
}
},
"develop": {
"backports.entry-points-selectable": {
"hashes": [
"sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a",
"sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"
],
"markers": "python_version >= '2.7'",
"version": "==1.1.0"
},
"certifi": {
"hashes": [
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
"version": "==2021.10.8"
},
"distlib": {
"hashes": [
"sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31",
"sha256:d982d0751ff6eaaab5e2ec8e691d949ee80eddf01a62eaa96ddb11531fe16b05"
],
"version": "==0.3.3"
},
"filelock": {
"hashes": [
"sha256:7afc856f74fa7006a289fd10fa840e1eebd8bbff6bffb69c26c54a0512ea8cf8",
"sha256:bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b"
],
"markers": "python_version >= '3.6'",
"version": "==3.3.2"
},
"flake8": {
"hashes": [
"sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
"sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
],
"index": "pypi",
"version": "==4.0.1"
},
"importlib-metadata": {
"hashes": [
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
],
"markers": "python_version < '3.8'",
"version": "==4.8.1"
},
"mccabe": {
"hashes": [
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
"sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
],
"version": "==0.6.1"
},
"mypy": {
"hashes": [
"sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9",
"sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a",
"sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9",
"sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e",
"sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2",
"sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212",
"sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b",
"sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885",
"sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150",
"sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703",
"sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072",
"sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457",
"sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e",
"sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0",
"sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb",
"sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97",
"sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8",
"sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811",
"sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6",
"sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de",
"sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504",
"sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921",
"sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"
],
"index": "pypi",
"version": "==0.910"
},
"mypy-extensions": {
"hashes": [
"sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
"sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
],
"version": "==0.4.3"
},
"pipenv": {
"hashes": [
"sha256:05958fadcd70b2de6a27542fcd2bd72dd5c59c6d35307fdac3e06361fb06e30e",
"sha256:d180f5be4775c552fd5e69ae18a9d6099d9dafb462efe54f11c72cb5f4d5e977"
],
"index": "pypi",
"version": "==2021.5.29"
},
"platformdirs": {
"hashes": [
"sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2",
"sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"
],
"markers": "python_version >= '3.6'",
"version": "==2.4.0"
},
"pycodestyle": {
"hashes": [
"sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
"sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==2.8.0"
},
"pyflakes": {
"hashes": [
"sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
"sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.4.0"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0"
},
"toml": {
"hashes": [
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.10.2"
},
"typed-ast": {
"hashes": [
"sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
"sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
"sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
"sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
"sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
"sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
"sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
"sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
"sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
"sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
"sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
"sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
"sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
"sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
"sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
"sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
"sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
"sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
"sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
"sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
"sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
"sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
"sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
"sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
"sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
"sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
"sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
"sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
"sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
"sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
],
"markers": "python_version < '3.8'",
"version": "==1.4.3"
},
"types-psycopg2": {
"hashes": [
"sha256:77ed80f2668582654623e04fb3d741ecce93effcc39c929d7e02f4a917a538ce",
"sha256:98a6e0e9580cd7eb4bd4d20f7c7063d154b2589a2b90c0ce4e3ca6085cde77c6"
],
"index": "pypi",
"version": "==2.9.1"
},
"types-requests": {
"hashes": [
"sha256:b279284e51f668e38ee12d9665e4d789089f532dc2a0be4a1508ca0efd98ba9e",
"sha256:ba1d108d512e294b6080c37f6ae7cb2a2abf527560e2b671d1786c1fc46b541a"
],
"index": "pypi",
"version": "==2.25.11"
},
"typing-extensions": {
"hashes": [
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
],
"index": "pypi",
"version": "==3.10.0.2"
},
"virtualenv": {
"hashes": [
"sha256:4b02e52a624336eece99c96e3ab7111f469c24ba226a53ec474e8e787b365814",
"sha256:576d05b46eace16a9c348085f7d0dc8ef28713a2cabaa1cf0aea41e8f12c9218"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==20.10.0"
},
"virtualenv-clone": {
"hashes": [
"sha256:418ee935c36152f8f153c79824bb93eaf6f0f7984bae31d3f48f350b9183501a",
"sha256:44d5263bceed0bac3e1424d64f798095233b64def1c5689afa43dc3223caf5b0"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.5.7"
},
"yapf": {
"hashes": [
"sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
"sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
],
"index": "pypi",
"version": "==0.31.0"
},
"zipp": {
"hashes": [
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
],
"markers": "python_version >= '3.6'",
"version": "==3.6.0"
}
}
}

View File

@@ -28,12 +28,12 @@ apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libsec
libssl-dev clang pkg-config libpq-dev libssl-dev clang pkg-config libpq-dev
``` ```
[Rust] 1.55 or later is also required. [Rust] 1.56.1 or later is also required.
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively. To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
To run the integration tests or Python scripts (not required to use the code), install To run the integration tests or Python scripts (not required to use the code), install
Python (3.7 or higher), and install python3 packages using `pipenv install` in the project directory. Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.
2. Build zenith and patched postgres 2. Build zenith and patched postgres
```sh ```sh
@@ -128,8 +128,7 @@ INSERT 0 1
```sh ```sh
git clone --recursive https://github.com/zenithdb/zenith.git git clone --recursive https://github.com/zenithdb/zenith.git
make # builds also postgres and installs it to ./tmp_install make # builds also postgres and installs it to ./tmp_install
cd test_runner ./scripts/pytest
pipenv run pytest
``` ```
## Documentation ## Documentation

1161
compute_tools/Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -2,13 +2,10 @@
name = "compute_tools" name = "compute_tools"
version = "0.1.0" version = "0.1.0"
authors = ["Alexey Kondratov <kondratov.aleksey@gmail.com>"] authors = ["Alexey Kondratov <kondratov.aleksey@gmail.com>"]
edition = "2018" edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[workspace]
# TODO: make it a part of the global zenith workspace
[dependencies] [dependencies]
libc = "0.2" libc = "0.2"
anyhow = "1.0" anyhow = "1.0"
@@ -17,12 +14,9 @@ clap = "2.33"
env_logger = "0.8" env_logger = "0.8"
hyper = { version = "0.14", features = ["full"] } hyper = { version = "0.14", features = ["full"] }
log = { version = "0.4", features = ["std", "serde"] } log = { version = "0.4", features = ["std", "serde"] }
postgres = "0.19" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
regex = "1" regex = "1"
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0" serde_json = "1"
tar = "0.4" tar = "0.4"
tokio = { version = "1", features = ["full"] } tokio = { version = "1", features = ["macros", "rt", "rt-multi-thread"] }
[profile.release]
debug = true

View File

@@ -1,14 +0,0 @@
# First transient image to build compute_tools binaries
FROM rust:slim-buster AS rust-build
RUN mkdir /compute_tools
WORKDIR /compute_tools
COPY . /compute_tools/
RUN cargo build --release
# Final image that only has one binary
FROM debian:buster-slim
COPY --from=rust-build /compute_tools/target/release/zenith_ctl /usr/local/bin/zenith_ctl

View File

@@ -27,14 +27,13 @@
//! ``` //! ```
//! //!
use std::fs::File; use std::fs::File;
use std::panic;
use std::path::Path; use std::path::Path;
use std::process::{exit, Command, ExitStatus}; use std::process::{exit, Command, ExitStatus};
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock};
use std::{env, panic};
use anyhow::Result; use anyhow::{Context, Result};
use chrono::Utc; use chrono::Utc;
use libc::{prctl, PR_SET_PDEATHSIG, SIGINT};
use log::info; use log::info;
use postgres::{Client, NoTls}; use postgres::{Client, NoTls};
@@ -70,7 +69,7 @@ fn prepare_pgdata(state: &Arc<RwLock<ComputeState>>) -> Result<()> {
.expect("tenant id should be provided"); .expect("tenant id should be provided");
info!( info!(
"applying spec for cluster #{}, operation #{}", "starting cluster #{}, operation #{}",
spec.cluster.cluster_id, spec.cluster.cluster_id,
spec.operation_uuid.as_ref().unwrap() spec.operation_uuid.as_ref().unwrap()
); );
@@ -80,10 +79,23 @@ fn prepare_pgdata(state: &Arc<RwLock<ComputeState>>) -> Result<()> {
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?; config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
info!("starting safekeepers syncing"); info!("starting safekeepers syncing");
let lsn = sync_safekeepers(&state.pgdata, &state.pgbin)?; let lsn = sync_safekeepers(&state.pgdata, &state.pgbin)
.with_context(|| "failed to sync safekeepers")?;
info!("safekeepers synced at LSN {}", lsn); info!("safekeepers synced at LSN {}", lsn);
get_basebackup(&state.pgdata, &pageserver_connstr, &tenant, &timeline, &lsn)?; info!(
"getting basebackup@{} from pageserver {}",
lsn, pageserver_connstr
);
get_basebackup(&state.pgdata, &pageserver_connstr, &tenant, &timeline, &lsn).with_context(
|| {
format!(
"failed to get basebackup@{} from pageserver {}",
lsn, pageserver_connstr
)
},
)?;
// Update pg_hba.conf received with basebackup. // Update pg_hba.conf received with basebackup.
update_pg_hba(pgdata_path)?; update_pg_hba(pgdata_path)?;
@@ -142,22 +154,13 @@ fn run_compute(state: &Arc<RwLock<ComputeState>>) -> Result<ExitStatus> {
} }
fn main() -> Result<()> { fn main() -> Result<()> {
// During configuration we are starting Postgres as a child process. If we
// fail we do not want to leave it running. PR_SET_PDEATHSIG sets the signal
// that will be sent to the child process when the parent dies. NB: this is
// cleared for the child of a fork(). SIGINT means fast shutdown for Postgres.
// This does not matter much for Docker, where `zenith_ctl` is an entrypoint,
// so the whole container will exit if it exits. But could be useful when
// `zenith_ctl` is used in e.g. systemd.
unsafe {
prctl(PR_SET_PDEATHSIG, SIGINT);
}
// TODO: re-use `zenith_utils::logging` later // TODO: re-use `zenith_utils::logging` later
init_logger(DEFAULT_LOG_LEVEL)?; init_logger(DEFAULT_LOG_LEVEL)?;
// Env variable is set by `cargo`
let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
let matches = clap::App::new("zenith_ctl") let matches = clap::App::new("zenith_ctl")
.version("0.1.0") .version(version.unwrap_or("unknown"))
.arg( .arg(
clap::Arg::with_name("connstr") clap::Arg::with_name("connstr")
.short("C") .short("C")
@@ -212,13 +215,7 @@ fn main() -> Result<()> {
let file = File::open(path)?; let file = File::open(path)?;
serde_json::from_reader(file)? serde_json::from_reader(file)?
} else { } else {
// Finally, try to fetch it from the env panic!("cluster spec should be provided via --spec or --spec-path argument");
// XXX: not tested well and kept as a backup option for k8s, Docker, etc.
// TODO: remove later
match env::var("CLUSTER_SPEC") {
Ok(json) => serde_json::from_str(&json)?,
Err(_) => panic!("cluster spec should be provided via --spec, --spec-path or env variable CLUSTER_SPEC")
}
} }
} }
}; };

View File

@@ -5,7 +5,7 @@ use std::process::Command;
use std::str::FromStr; use std::str::FromStr;
use std::{fs, thread, time}; use std::{fs, thread, time};
use anyhow::{anyhow, Result}; use anyhow::{bail, Result};
use postgres::{Client, Transaction}; use postgres::{Client, Transaction};
use serde::Deserialize; use serde::Deserialize;
@@ -226,7 +226,7 @@ pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> {
// but postgres starts listening almost immediately, even if it is not really // but postgres starts listening almost immediately, even if it is not really
// ready to accept connections). // ready to accept connections).
if slept >= POSTGRES_WAIT_TIMEOUT { if slept >= POSTGRES_WAIT_TIMEOUT {
return Err(anyhow!("timed out while waiting for Postgres to start")); bail!("timed out while waiting for Postgres to start");
} }
if pid_path.exists() { if pid_path.exists() {

View File

@@ -87,17 +87,19 @@ pub fn sync_safekeepers(pgdata: &str, pgbin: &str) -> Result<String> {
.args(&["--sync-safekeepers"]) .args(&["--sync-safekeepers"])
.env("PGDATA", &pgdata) // we cannot use -D in this mode .env("PGDATA", &pgdata) // we cannot use -D in this mode
.stdout(Stdio::piped()) .stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn() .spawn()
.expect("postgres --sync-safekeepers failed to start"); .expect("postgres --sync-safekeepers failed to start");
// `postgres --sync-safekeepers` will print all log output to stderr and
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
// redirected to the caller output.
let sync_output = sync_handle let sync_output = sync_handle
.wait_with_output() .wait_with_output()
.expect("postgres --sync-safekeepers failed"); .expect("postgres --sync-safekeepers failed");
if !sync_output.status.success() { if !sync_output.status.success() {
anyhow::bail!( anyhow::bail!(
"postgres --sync-safekeepers failed: '{}'", "postgres --sync-safekeepers exited with non-zero status: {}",
String::from_utf8_lossy(&sync_output.stderr) sync_output.status,
); );
} }

View File

@@ -2,13 +2,13 @@
name = "control_plane" name = "control_plane"
version = "0.1.0" version = "0.1.0"
authors = ["Stas Kelvich <stas@zenith.tech>"] authors = ["Stas Kelvich <stas@zenith.tech>"]
edition = "2018" edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
tar = "0.4.33" tar = "0.4.33"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
toml = "0.5" toml = "0.5"
lazy_static = "1.4" lazy_static = "1.4"

View File

@@ -82,15 +82,11 @@ impl ComputeControlPlane {
let mut strings = s.split('@'); let mut strings = s.split('@');
let name = strings.next().unwrap(); let name = strings.next().unwrap();
let lsn: Option<Lsn>; let lsn = strings
if let Some(lsnstr) = strings.next() { .next()
lsn = Some( .map(Lsn::from_str)
Lsn::from_str(lsnstr) .transpose()
.with_context(|| "invalid LSN in point-in-time specification")?, .context("invalid LSN in point-in-time specification")?;
);
} else {
lsn = None
}
// Resolve the timeline ID, given the human-readable branch name // Resolve the timeline ID, given the human-readable branch name
let timeline_id = self let timeline_id = self
@@ -253,16 +249,16 @@ impl PostgresNode {
let mut client = self let mut client = self
.pageserver .pageserver
.page_server_psql_client() .page_server_psql_client()
.with_context(|| "connecting to page server failed")?; .context("connecting to page server failed")?;
let copyreader = client let copyreader = client
.copy_out(sql.as_str()) .copy_out(sql.as_str())
.with_context(|| "page server 'basebackup' command failed")?; .context("page server 'basebackup' command failed")?;
// Read the archive directly from the `CopyOutReader` // Read the archive directly from the `CopyOutReader`
tar::Archive::new(copyreader) tar::Archive::new(copyreader)
.unpack(&self.pgdata()) .unpack(&self.pgdata())
.with_context(|| "extracting base backup failed")?; .context("extracting base backup failed")?;
Ok(()) Ok(())
} }
@@ -443,7 +439,7 @@ impl PostgresNode {
if let Some(token) = auth_token { if let Some(token) = auth_token {
cmd.env("ZENITH_AUTH_TOKEN", token); cmd.env("ZENITH_AUTH_TOKEN", token);
} }
let pg_ctl = cmd.status().with_context(|| "pg_ctl failed")?; let pg_ctl = cmd.status().context("pg_ctl failed")?;
if !pg_ctl.success() { if !pg_ctl.success() {
anyhow::bail!("pg_ctl failed"); anyhow::bail!("pg_ctl failed");

View File

@@ -9,6 +9,7 @@
use anyhow::{anyhow, bail, Context, Result}; use anyhow::{anyhow, bail, Context, Result};
use std::fs; use std::fs;
use std::path::Path; use std::path::Path;
use std::process::Command;
pub mod compute; pub mod compute;
pub mod local_env; pub mod local_env;
@@ -31,3 +32,19 @@ pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
} }
Ok(pid) Ok(pid)
} }
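// Builds a clean environment for spawned zenith binaries: clears all inherited
// variables, always sets RUST_BACKTRACE=1, and passes through LLVM_PROFILE_FILE
// (for coverage builds) and RUST_LOG when they are set in the caller's environment.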
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
let cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
let var = "LLVM_PROFILE_FILE";
if let Some(val) = std::env::var_os(var) {
cmd.env(var, val);
}
const RUST_LOG_KEY: &str = "RUST_LOG";
if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
cmd.env(RUST_LOG_KEY, rust_log_value)
} else {
cmd
}
}

View File

@@ -251,7 +251,7 @@ impl LocalEnv {
.arg("2048") .arg("2048")
.stdout(Stdio::null()) .stdout(Stdio::null())
.output() .output()
.with_context(|| "failed to generate auth private key")?; .context("failed to generate auth private key")?;
if !keygen_output.status.success() { if !keygen_output.status.success() {
bail!( bail!(
"openssl failed: '{}'", "openssl failed: '{}'",
@@ -270,7 +270,7 @@ impl LocalEnv {
.args(&["-out", public_key_path.to_str().unwrap()]) .args(&["-out", public_key_path.to_str().unwrap()])
.stdout(Stdio::null()) .stdout(Stdio::null())
.output() .output()
.with_context(|| "failed to generate auth private key")?; .context("failed to generate auth private key")?;
if !keygen_output.status.success() { if !keygen_output.status.success() {
bail!( bail!(
"openssl failed: '{}'", "openssl failed: '{}'",

View File

@@ -4,7 +4,7 @@
/// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just /// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
/// enough to extract a few settings we need in Zenith, assuming you don't do /// enough to extract a few settings we need in Zenith, assuming you don't do
/// funny stuff like include-directives or funny escaping. /// funny stuff like include-directives or funny escaping.
use anyhow::{anyhow, bail, Context, Result}; use anyhow::{bail, Context, Result};
use lazy_static::lazy_static; use lazy_static::lazy_static;
use regex::Regex; use regex::Regex;
use std::collections::HashMap; use std::collections::HashMap;
@@ -78,7 +78,7 @@ impl PostgresConf {
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static, <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
{ {
self.get(field_name) self.get(field_name)
.ok_or_else(|| anyhow!("could not find '{}' option {}", field_name, context))? .with_context(|| format!("could not find '{}' option {}", field_name, context))?
.parse::<T>() .parse::<T>()
.with_context(|| format!("could not parse '{}' option {}", field_name, context)) .with_context(|| format!("could not parse '{}' option {}", field_name, context))
} }

View File

@@ -17,8 +17,8 @@ use thiserror::Error;
use zenith_utils::http::error::HttpErrorBody; use zenith_utils::http::error::HttpErrorBody;
use crate::local_env::{LocalEnv, SafekeeperConf}; use crate::local_env::{LocalEnv, SafekeeperConf};
use crate::read_pidfile;
use crate::storage::PageServerNode; use crate::storage::PageServerNode;
use crate::{fill_rust_env_vars, read_pidfile};
use zenith_utils::connstring::connection_address; use zenith_utils::connstring::connection_address;
#[derive(Error, Debug)] #[derive(Error, Debug)]
@@ -118,22 +118,17 @@ impl SafekeeperNode {
let listen_http = format!("localhost:{}", self.conf.http_port); let listen_http = format!("localhost:{}", self.conf.http_port);
let mut cmd = Command::new(self.env.safekeeper_bin()?); let mut cmd = Command::new(self.env.safekeeper_bin()?);
cmd.args(&["-D", self.datadir_path().to_str().unwrap()]) fill_rust_env_vars(
.args(&["--listen-pg", &listen_pg]) cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
.args(&["--listen-http", &listen_http]) .args(&["--listen-pg", &listen_pg])
.args(&["--recall", "1 second"]) .args(&["--listen-http", &listen_http])
.arg("--daemonize") .args(&["--recall", "1 second"])
.env_clear() .arg("--daemonize"),
.env("RUST_BACKTRACE", "1"); );
if !self.conf.sync { if !self.conf.sync {
cmd.arg("--no-sync"); cmd.arg("--no-sync");
} }
let var = "LLVM_PROFILE_FILE";
if let Some(val) = std::env::var_os(var) {
cmd.env(var, val);
}
if !cmd.status()?.success() { if !cmd.status()?.success() {
bail!( bail!(
"Safekeeper failed to start. See '{}' for details.", "Safekeeper failed to start. See '{}' for details.",

View File

@@ -19,7 +19,7 @@ use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::ZTenantId; use zenith_utils::zid::ZTenantId;
use crate::local_env::LocalEnv; use crate::local_env::LocalEnv;
use crate::read_pidfile; use crate::{fill_rust_env_vars, read_pidfile};
use pageserver::branches::BranchInfo; use pageserver::branches::BranchInfo;
use pageserver::tenant_mgr::TenantInfo; use pageserver::tenant_mgr::TenantInfo;
use zenith_utils::connstring::connection_address; use zenith_utils::connstring::connection_address;
@@ -96,46 +96,49 @@ impl PageServerNode {
.unwrap() .unwrap()
} }
pub fn init(&self, create_tenant: Option<&str>) -> anyhow::Result<()> { pub fn init(
&self,
create_tenant: Option<&str>,
config_overrides: &[&str],
) -> anyhow::Result<()> {
let mut cmd = Command::new(self.env.pageserver_bin()?); let mut cmd = Command::new(self.env.pageserver_bin()?);
let var = "LLVM_PROFILE_FILE";
if let Some(val) = std::env::var_os(var) {
cmd.env(var, val);
}
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc. // FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc.
let mut args = vec![ let base_data_dir_param = self.env.base_data_dir.display().to_string();
"--init".to_string(), let pg_distrib_dir_param =
"-D".to_string(), format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display());
self.env.base_data_dir.display().to_string(), let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type);
"-c".to_string(), let listen_http_addr_param = format!(
format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display()), "listen_http_addr='{}'",
"-c".to_string(), self.env.pageserver.listen_http_addr
format!("auth_type='{}'", self.env.pageserver.auth_type), );
"-c".to_string(), let listen_pg_addr_param =
format!( format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
"listen_http_addr='{}'", let mut args = Vec::with_capacity(20);
self.env.pageserver.listen_http_addr
), args.push("--init");
"-c".to_string(), args.extend(["-D", &base_data_dir_param]);
format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr), args.extend(["-c", &pg_distrib_dir_param]);
]; args.extend(["-c", &authg_type_param]);
args.extend(["-c", &listen_http_addr_param]);
args.extend(["-c", &listen_pg_addr_param]);
for config_override in config_overrides {
args.extend(["-c", config_override]);
}
if self.env.pageserver.auth_type != AuthType::Trust { if self.env.pageserver.auth_type != AuthType::Trust {
args.extend([ args.extend([
"-c".to_string(), "-c",
"auth_validation_public_key_path='auth_public_key.pem'".to_string(), "auth_validation_public_key_path='auth_public_key.pem'",
]); ]);
} }
if let Some(tenantid) = create_tenant { if let Some(tenantid) = create_tenant {
args.extend(["--create-tenant".to_string(), tenantid.to_string()]) args.extend(["--create-tenant", tenantid])
} }
let status = cmd let status = fill_rust_env_vars(cmd.args(args))
.args(args)
.env_clear()
.env("RUST_BACKTRACE", "1")
.status() .status()
.expect("pageserver init failed"); .expect("pageserver init failed");
@@ -154,7 +157,7 @@ impl PageServerNode {
self.repo_path().join("pageserver.pid") self.repo_path().join("pageserver.pid")
} }
pub fn start(&self) -> anyhow::Result<()> { pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
print!( print!(
"Starting pageserver at '{}' in '{}'", "Starting pageserver at '{}' in '{}'",
connection_address(&self.pg_connection_config), connection_address(&self.pg_connection_config),
@@ -163,16 +166,16 @@ impl PageServerNode {
io::stdout().flush().unwrap(); io::stdout().flush().unwrap();
let mut cmd = Command::new(self.env.pageserver_bin()?); let mut cmd = Command::new(self.env.pageserver_bin()?);
cmd.args(&["-D", self.repo_path().to_str().unwrap()])
.arg("--daemonize")
.env_clear()
.env("RUST_BACKTRACE", "1");
let var = "LLVM_PROFILE_FILE"; let repo_path = self.repo_path();
if let Some(val) = std::env::var_os(var) { let mut args = vec!["-D", repo_path.to_str().unwrap()];
cmd.env(var, val);
for config_override in config_overrides {
args.extend(["-c", config_override]);
} }
fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
if !cmd.status()?.success() { if !cmd.status()?.success() {
bail!( bail!(
"Pageserver failed to start. See '{}' for details.", "Pageserver failed to start. See '{}' for details.",

View File

@@ -2,6 +2,16 @@
### Authentication ### Authentication
### Backpressure
Backpressure is used to limit the lag between the pageserver and the compute node or WAL service.
If the compute node or WAL service runs far ahead of the pageserver,
the time to serve page requests increases, which may lead to timeout errors.
To tune the backpressure limits, use the `max_replication_write_lag`, `max_replication_flush_lag` and `max_replication_apply_lag` settings.
When the lag between the current LSN (`pg_current_wal_flush_lsn()` on the compute node) and the minimal write/flush/apply position of the replica exceeds the limit,
backends performing writes are blocked until the replica catches up.
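As a rough, non-authoritative sketch: assuming these settings are exposed as regular GUCs on the compute node (and with the connection string below as a placeholder), the current limit and the compute's flush position could be inspected like this:
```bash
# Placeholder connection string; adjust for your setup.
COMPUTE_CONNSTR="postgres://zenith_admin@localhost:55432/postgres"

# Show one of the backpressure limits (assumes the setting is visible via SHOW).
psql "$COMPUTE_CONNSTR" -c "SHOW max_replication_write_lag;"

# Current flush LSN on the compute node, the value the write/flush/apply lag is measured against.
psql "$COMPUTE_CONNSTR" -c "SELECT pg_current_wal_flush_lsn();"
```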
### Base image (page image) ### Base image (page image)
### Basebackup ### Basebackup
@@ -76,7 +86,37 @@ The layer map tracks what layers exist for all the relishes in a timeline.
Zenith repository implementation that keeps data in layers. Zenith repository implementation that keeps data in layers.
### LSN ### LSN
The Log Sequence Number (LSN) is a unique identifier of a WAL record in the WAL log.
The insert position is a byte offset into the logs, increasing monotonically with each new record.
Internally, an LSN is a 64-bit integer, representing a byte position in the write-ahead log stream.
It is printed as two hexadecimal numbers of up to 8 digits each, separated by a slash.
See also the [PostgreSQL doc about the pg_lsn type](https://www.postgresql.org/docs/devel/datatype-pg-lsn.html).
Values can be compared to calculate the volume of WAL data that separates them, so they are used to measure the progress of replication and recovery.
In Postgres and Zenith, LSNs are used to describe certain points in WAL handling.
PostgreSQL LSNs and functions to monitor them:
* `pg_current_wal_insert_lsn()` - Returns the current write-ahead log insert location.
* `pg_current_wal_lsn()` - Returns the current write-ahead log write location.
* `pg_current_wal_flush_lsn()` - Returns the current write-ahead log flush location.
* `pg_last_wal_receive_lsn()` - Returns the last write-ahead log location that has been received and synced to disk by streaming replication. While streaming replication is in progress this will increase monotonically.
* `pg_last_wal_replay_lsn()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
(Source: [PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html).)
Zenith safekeeper LSNs (for more details, see [walkeeper/README_PROTO.md](/walkeeper/README_PROTO.md)):
* `CommitLSN`: position in WAL confirmed by quorum safekeepers.
* `RestartLSN`: position in WAL confirmed by all safekeepers.
* `FlushLSN`: part of WAL persisted to the disk by safekeeper.
* `VCL`: the largest LSN for which we can guarantee availability of all prior records.
Zenith pageserver LSNs:
* `last_record_lsn` - the end of the last processed WAL record.
* `disk_consistent_lsn` - data is known to be fully flushed and fsync'd to local disk on the pageserver up to this LSN.
* `remote_consistent_lsn` - the last LSN that is synced to remote storage and is guaranteed to survive a pageserver crash.
TODO: use this name consistently in remote storage code. Now `disk_consistent_lsn` is used and its meaning depends on the context.
* `ancestor_lsn` - LSN of the branch point (the LSN at which this branch was created).
TODO: add a table that describes the mapping between PostgreSQL (compute), safekeeper and pageserver LSNs.
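As a small illustration (plain PostgreSQL, not Zenith-specific), the compute-side positions listed above can be queried and compared directly:
```bash
# Placeholder connection string for the compute node.
COMPUTE_CONNSTR="postgres://zenith_admin@localhost:55432/postgres"

# Insert, write and flush positions in one query.
psql "$COMPUTE_CONNSTR" -c "SELECT pg_current_wal_insert_lsn() AS insert_lsn,
                                   pg_current_wal_lsn()        AS write_lsn,
                                   pg_current_wal_flush_lsn()  AS flush_lsn;"

# Byte distance between two LSNs ('0/16B3748' is an arbitrary example value).
psql "$COMPUTE_CONNSTR" -c "SELECT pg_wal_lsn_diff(pg_current_wal_lsn(), '0/16B3748');"
```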
### Page (block) ### Page (block)
The basic structure used to store relation data. All pages are of the same size. The basic structure used to store relation data. All pages are of the same size.

View File

@@ -0,0 +1,22 @@
## Pageserver tenant migration
### Overview
This feature allows migrating a timeline from one pageserver to another by utilizing the remote storage capability.
### Migration process
The pageserver implements two new HTTP handlers: timeline attach and timeline detach.
Timeline migration is performed in the following way (a rough sketch of the calls follows the list):
1. Timeline attach is called on the target pageserver. This asks the pageserver to download the latest checkpoint uploaded to S3.
2. For now it is necessary to manually initialize the replication stream via a callmemaybe call, so that the target pageserver starts replication from the safekeeper (it is desirable to avoid this and initialize replication directly in the attach handler, but that requires some refactoring, probably [#997](https://github.com/zenithdb/zenith/issues/997)/[#1049](https://github.com/zenithdb/zenith/issues/1049)).
3. Replication state can be tracked via the timeline detail pageserver call.
4. The compute node should be restarted with the new pageserver connection string. The issue of multiple compute nodes for one timeline is handled at the safekeeper consensus level, so it is not a problem here. Currently the responsibility for rescheduling the compute with the updated config lies with an external coordinator (the console).
5. The timeline is detached from the old pageserver and its on-disk data is removed.
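Purely as a hypothetical illustration of this sequence (the HTTP paths, hosts, ports and IDs below are invented for the sketch and are not the actual pageserver API; steps 2 and 4 are coordinator-specific and omitted):
```bash
# Hypothetical sketch only: endpoint paths, hosts, ports and IDs are placeholders,
# not the real pageserver HTTP API.
OLD_PAGESERVER="http://old-pageserver:9898"
NEW_PAGESERVER="http://new-pageserver:9898"
TENANT_ID="$1"    # tenant id, passed as the first argument
TIMELINE_ID="$2"  # timeline id, passed as the second argument

# 1. Attach the timeline on the target pageserver (it downloads the latest checkpoint from S3).
curl -X POST "$NEW_PAGESERVER/v1/timeline/$TENANT_ID/$TIMELINE_ID/attach"

# 3. Poll the timeline detail until the new pageserver has caught up.
curl "$NEW_PAGESERVER/v1/timeline/$TENANT_ID/$TIMELINE_ID/detail"

# 5. Detach the timeline from the old pageserver; its on-disk data is removed.
curl -X POST "$OLD_PAGESERVER/v1/timeline/$TENANT_ID/$TIMELINE_ID/detach"
```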
### Implementation details
Now the safekeeper needs to track which pageserver it is replicating to. This introduces complications into the replication code:
* We need to distinguish different pageservers (currently this is done by connection string, which is imperfect and is covered here: https://github.com/zenithdb/zenith/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented).
* We need to track which pageserver is the primary. This is needed to avoid reconnecting to non-primary pageservers when they decide to stop their walreceiver. For example, this can happen when there is load on the compute and we are trying to detach the timeline from the old pageserver: in this case callmemaybe will keep trying to reconnect to it because the replication termination condition is never met (a pageserver with an active compute may never catch up to the latest LSN, so there is always some WAL tail).

View File

@@ -147,6 +147,10 @@ bucket_name = 'some-sample-bucket'
# Name of the region where the bucket is located at # Name of the region where the bucket is located at
bucket_region = 'eu-north-1' bucket_region = 'eu-north-1'
# A "subfolder" in the bucket, to use the same bucket separately by multiple pageservers at once.
# Optional, pageserver uses entire bucket if the prefix is not specified.
prefix_in_bucket = '/some/prefix/'
# Access key to connect to the bucket ("login" part of the credentials) # Access key to connect to the bucket ("login" part of the credentials)
access_key_id = 'SOMEKEYAAAAASADSAH*#' access_key_id = 'SOMEKEYAAAAASADSAH*#'

View File

@@ -87,31 +87,29 @@ so manual installation of dependencies is not recommended.
A single virtual environment with all dependencies is described in the single `Pipfile`. A single virtual environment with all dependencies is described in the single `Pipfile`.
### Prerequisites ### Prerequisites
- Install Python 3.7 (the minimal supported version) - Install Python 3.7 (the minimal supported version) or greater.
- Later version (e.g. 3.8) is ok if you don't write Python code - Our setup with poetry should work with newer Python versions too, so feel free to open an issue with a `c/test-runner` label if something doesn't work as expected.
- You can install Python 3.7 separately, e.g.: - If you have trouble with another version, you can resolve it by installing Python 3.7 separately, via pyenv or via the system package manager, e.g.:
```bash ```bash
# In Ubuntu # In Ubuntu
sudo add-apt-repository ppa:deadsnakes/ppa sudo add-apt-repository ppa:deadsnakes/ppa
sudo apt update sudo apt update
sudo apt install python3.7 sudo apt install python3.7
``` ```
- Install `pipenv` - Install `poetry`
- Exact version of `pipenv` is not important, you can use Debian/Ubuntu package `pipenv`. - Exact version of `poetry` is not important; see the installation instructions on poetry's [website](https://python-poetry.org/docs/#installation).
- Install dependencies via either - Install dependencies via `./scripts/pysync`. Note that CI uses Python 3.7, so if you have a different version, some linting tools can yield different results locally vs. in CI.
* `pipenv --python 3.7 install --dev` if you will write Python code, or
* `pipenv install` if you only want to run Python scripts and don't have Python 3.7.
Run `pipenv shell` to activate the virtual environment. Run `poetry shell` to activate the virtual environment.
Alternatively, use `pipenv run` to run a single command in the venv, e.g. `pipenv run pytest`. Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.
### Obligatory checks ### Obligatory checks
We force code formatting via `yapf` and type hints via `mypy`. We force code formatting via `yapf` and type hints via `mypy`.
Run the following commands in the repository's root (next to `setup.cfg`): Run the following commands in the repository's root (next to `setup.cfg`):
```bash ```bash
pipenv run yapf -ri . # All code is reformatted poetry run yapf -ri . # All code is reformatted
pipenv run mypy . # Ensure there are no typing errors poetry run mypy . # Ensure there are no typing errors
``` ```
**WARNING**: do not run `mypy` from a directory other than the root of the repository. **WARNING**: do not run `mypy` from a directory other than the root of the repository.
@@ -123,17 +121,6 @@ Also consider:
* Adding more type hints to your code to avoid `Any`. * Adding more type hints to your code to avoid `Any`.
### Changing dependencies ### Changing dependencies
You have to update `Pipfile.lock` if you have changed `Pipfile`: To add a new package or change an existing one, you can use `poetry add` or `poetry update`, or edit `pyproject.toml` manually. Do not forget to run `poetry lock` in the latter case (see the sketch below).
```bash More details are available in poetry's [documentation](https://python-poetry.org/docs/).
pipenv --python 3.7 install --dev # Re-create venv for Python 3.7 and install recent pipenv inside
pipenv run pipenv --version # Should be at least 2021.5.29
pipenv run pipenv lock # Regenerate Pipfile.lock
```
As the minimal supported version is Python 3.7 and we use it in CI,
you have to use a Python 3.7 environment when updating `Pipfile.lock`.
Otherwise some back-compatibility packages will be missing.
It is also important to run recent `pipenv`.
Older versions remove markers from `Pipfile.lock`.
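For reference, the `poetry` workflow described above boils down to a handful of commands (a minimal sketch; `requests` is only an example package name):
```bash
# Add a new dependency or update an existing one (updates pyproject.toml and the lock file).
poetry add requests
poetry update requests

# If pyproject.toml was edited by hand, regenerate the lock file.
poetry lock

# Run the obligatory checks inside the managed virtualenv.
poetry run yapf -ri .
poetry run mypy .
```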

View File

@@ -2,7 +2,7 @@
name = "pageserver" name = "pageserver"
version = "0.1.0" version = "0.1.0"
authors = ["Stas Kelvich <stas@zenith.tech>"] authors = ["Stas Kelvich <stas@zenith.tech>"]
edition = "2018" edition = "2021"
[dependencies] [dependencies]
bookfile = { git = "https://github.com/zenithdb/bookfile.git", branch="generic-readext" } bookfile = { git = "https://github.com/zenithdb/bookfile.git", branch="generic-readext" }
@@ -18,13 +18,13 @@ log = "0.4.14"
clap = "2.33.0" clap = "2.33.0"
daemonize = "0.4.1" daemonize = "0.4.1"
tokio = { version = "1.11", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } tokio = { version = "1.11", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
tokio-stream = "0.1.8" tokio-stream = "0.1.8"
routerify = "2" routerify = "2"
anyhow = "1.0" anyhow = { version = "1.0", features = ["backtrace"] }
crc32c = "0.6.0" crc32c = "0.6.0"
thiserror = "1.0" thiserror = "1.0"
hex = { version = "0.4.3", features = ["serde"] } hex = { version = "0.4.3", features = ["serde"] }

View File

@@ -129,13 +129,13 @@ There are the following implementations present:
* local filesystem — to use in tests mainly * local filesystem — to use in tests mainly
* AWS S3 - to use in production * AWS S3 - to use in production
Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs. Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs; parameter documentation can be found in the [settings docs](../docs/settings.md).
The backup service is disabled by default and can be enabled to interact with a single remote storage. The backup service is disabled by default and can be enabled to interact with a single remote storage.
CLI examples: CLI examples:
* Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"` * Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"`
* AWS S3 : `${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1',access_key_id='SOMEKEYAAAAASADSAH*#',secret_access_key='SOMEsEcReTsd292v'}"` * AWS S3 : `${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/',access_key_id='SOMEKEYAAAAASADSAH*#',secret_access_key='SOMEsEcReTsd292v'}"`
For Amazon AWS S3, a key id and secret access key can be found in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, or on the AWS Settings page for a certain user. Also note that bucket names do not contain any protocols when used on AWS. For Amazon AWS S3, a key id and secret access key can be found in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, or on the AWS Settings page for a certain user. Also note that bucket names do not contain any protocols when used on AWS.
For local S3 installations, refer to their documentation for name format and credentials. For local S3 installations, refer to their documentation for name format and credentials.
@@ -154,6 +154,7 @@ or
[remote_storage]
bucket_name = 'some-sample-bucket'
bucket_region = 'eu-north-1'
prefix_in_bucket = '/test_prefix/'
access_key_id = 'SOMEKEYAAAAASADSAH*#'
secret_access_key = 'SOMEsEcReTsd292v'
```


@@ -1,6 +1,6 @@
//! Main entry point for the Page Server executable. //! Main entry point for the Page Server executable.
use std::{env, path::Path, str::FromStr, thread}; use std::{env, path::Path, str::FromStr};
use tracing::*; use tracing::*;
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener, GIT_VERSION}; use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener, GIT_VERSION};
@@ -12,7 +12,9 @@ use daemonize::Daemonize;
use pageserver::{ use pageserver::{
branches, branches,
config::{defaults::*, PageServerConf}, config::{defaults::*, PageServerConf},
http, page_cache, page_service, remote_storage, tenant_mgr, virtual_file, LOG_FILE_NAME, http, page_cache, page_service, remote_storage, tenant_mgr, thread_mgr,
thread_mgr::ThreadKind,
virtual_file, LOG_FILE_NAME,
}; };
use zenith_utils::http::endpoint; use zenith_utils::http::endpoint;
use zenith_utils::postgres_backend; use zenith_utils::postgres_backend;
@@ -53,12 +55,12 @@ fn main() -> Result<()> {
) )
// See `settings.md` for more details on the extra configuration parameters pageserver can process // See `settings.md` for more details on the extra configuration parameters pageserver can process
.arg( .arg(
Arg::with_name("config-option") Arg::with_name("config-override")
.short("c") .short("c")
.takes_value(true) .takes_value(true)
.number_of_values(1) .number_of_values(1)
.multiple(true) .multiple(true)
.help("Additional configuration options or overrides of the ones from the toml config file. .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there).
Any option has to be a valid toml document, example: `-c \"foo='hey'\"` `-c \"foo={value=1}\"`"), Any option has to be a valid toml document, example: `-c \"foo='hey'\"` `-c \"foo={value=1}\"`"),
) )
.get_matches(); .get_matches();
@@ -105,7 +107,7 @@ fn main() -> Result<()> {
}; };
// Process any extra options given with -c // Process any extra options given with -c
if let Some(values) = arg_matches.values_of("config-option") { if let Some(values) = arg_matches.values_of("config-override") {
for option_line in values { for option_line in values {
let doc = toml_edit::Document::from_str(option_line).with_context(|| { let doc = toml_edit::Document::from_str(option_line).with_context(|| {
format!( format!(
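A minimal sketch of how such a `-c` override could be merged into a base TOML document with `toml_edit` (the `apply_config_override` helper and its merge behaviour are assumptions made for illustration, not the pageserver's actual logic):
```rust
use std::str::FromStr;

use anyhow::Context;
use toml_edit::Document;

// Hypothetical helper: parse one `-c "key=value"` override (which must itself be a
// valid TOML document) and copy its top-level entries into the base config document.
fn apply_config_override(base: &mut Document, override_line: &str) -> anyhow::Result<()> {
    let doc = Document::from_str(override_line)
        .with_context(|| format!("Could not parse `-c` argument '{}' as TOML", override_line))?;
    for (key, item) in doc.iter() {
        base[key] = item.clone();
    }
    Ok(())
}

fn main() -> anyhow::Result<()> {
    let mut base = Document::from_str("listen_pg_addr = '127.0.0.1:6400'")?;
    apply_config_override(&mut base, "remote_storage={local_path='/some/local/path/'}")?;
    apply_config_override(&mut base, "foo={value=1}")?;
    println!("{}", base);
    Ok(())
}
```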
@@ -169,7 +171,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
); );
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?; let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
// XXX: Don't spawn any threads before daemonizing! // NB: Don't spawn any threads before daemonizing!
if daemonize { if daemonize {
info!("daemonizing..."); info!("daemonizing...");
@@ -195,15 +197,9 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
} }
let signals = signals::install_shutdown_handlers()?; let signals = signals::install_shutdown_handlers()?;
let mut threads = Vec::new();
let sync_startup = remote_storage::start_local_timeline_sync(conf) let sync_startup = remote_storage::start_local_timeline_sync(conf)
.context("Failed to set up local files sync with external storage")?; .context("Failed to set up local files sync with external storage")?;
if let Some(handle) = sync_startup.sync_loop_handle {
threads.push(handle);
}
// Initialize tenant manager. // Initialize tenant manager.
tenant_mgr::set_timeline_states(conf, sync_startup.initial_timeline_states); tenant_mgr::set_timeline_states(conf, sync_startup.initial_timeline_states);
@@ -220,25 +216,27 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
// Spawn a new thread for the http endpoint // Spawn a new thread for the http endpoint
// bind before launching separate thread so the error reported before startup exits // bind before launching separate thread so the error reported before startup exits
let cloned = auth.clone(); let auth_cloned = auth.clone();
threads.push( thread_mgr::spawn(
thread::Builder::new() ThreadKind::HttpEndpointListener,
.name("http_endpoint_thread".into()) None,
.spawn(move || { None,
let router = http::make_router(conf, cloned); "http_endpoint_thread",
endpoint::serve_thread_main(router, http_listener) move || {
})?, let router = http::make_router(conf, auth_cloned);
); endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
},
)?;
// Spawn a thread to listen for connections. It will spawn further threads // Spawn a thread to listen for libpq connections. It will spawn further threads
// for each connection. // for each connection.
threads.push( thread_mgr::spawn(
thread::Builder::new() ThreadKind::LibpqEndpointListener,
.name("Page Service thread".into()) None,
.spawn(move || { None,
page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type) "libpq endpoint thread",
})?, move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type),
); )?;
signals.handle(|signal| match signal { signals.handle(|signal| match signal {
Signal::Quit => { Signal::Quit => {
@@ -254,20 +252,38 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
"Got {}. Terminating gracefully in fast shutdown mode", "Got {}. Terminating gracefully in fast shutdown mode",
signal.name() signal.name()
); );
shutdown_pageserver();
postgres_backend::set_pgbackend_shutdown_requested(); unreachable!()
tenant_mgr::shutdown_all_tenants()?;
endpoint::shutdown();
for handle in std::mem::take(&mut threads) {
handle
.join()
.expect("thread panicked")
.expect("thread exited with an error");
}
info!("Shut down successfully completed");
std::process::exit(0);
} }
}) })
} }
fn shutdown_pageserver() {
// Shut down the libpq endpoint thread. This prevents new connections from
// being accepted.
thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None);
// Shut down any page service threads.
postgres_backend::set_pgbackend_shutdown_requested();
thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None);
// Shut down all the tenants. This flushes everything to disk and kills
// the checkpoint and GC threads.
tenant_mgr::shutdown_all_tenants();
// Stop syncing with remote storage.
//
// FIXME: Does this wait for the sync thread to finish syncing what's queued up?
// Should it?
thread_mgr::shutdown_threads(Some(ThreadKind::StorageSync), None, None);
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
thread_mgr::shutdown_threads(Some(ThreadKind::HttpEndpointListener), None, None);
// There should be nothing left, but let's be sure
thread_mgr::shutdown_threads(None, None, None);
info!("Shut down successfully completed");
std::process::exit(0);
}


@@ -4,7 +4,7 @@
// TODO: move all paths construction to conf impl // TODO: move all paths construction to conf impl
// //
use anyhow::{anyhow, bail, Context, Result}; use anyhow::{bail, Context, Result};
use postgres_ffi::ControlFileData; use postgres_ffi::ControlFileData;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::{ use std::{
@@ -45,14 +45,16 @@ impl BranchInfo {
repo: &Arc<dyn Repository>, repo: &Arc<dyn Repository>,
include_non_incremental_logical_size: bool, include_non_incremental_logical_size: bool,
) -> Result<Self> { ) -> Result<Self> {
let name = path let path = path.as_ref();
.as_ref() let name = path.file_name().unwrap().to_string_lossy().to_string();
.file_name() let timeline_id = std::fs::read_to_string(path)
.unwrap() .with_context(|| {
.to_str() format!(
.unwrap() "Failed to read branch file contents at path '{}'",
.to_string(); path.display()
let timeline_id = std::fs::read_to_string(path)?.parse::<ZTimelineId>()?; )
})?
.parse::<ZTimelineId>()?;
let timeline = match repo.get_timeline(timeline_id)? { let timeline = match repo.get_timeline(timeline_id)? {
RepositoryTimeline::Local(local_entry) => local_entry, RepositoryTimeline::Local(local_entry) => local_entry,
@@ -116,7 +118,7 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str
if let Some(tenantid) = create_tenant { if let Some(tenantid) = create_tenant {
let tenantid = ZTenantId::from_str(tenantid)?; let tenantid = ZTenantId::from_str(tenantid)?;
println!("initializing tenantid {}", tenantid); println!("initializing tenantid {}", tenantid);
create_repo(conf, tenantid, dummy_redo_mgr).with_context(|| "failed to create repo")?; create_repo(conf, tenantid, dummy_redo_mgr).context("failed to create repo")?;
} }
crashsafe_dir::create_dir_all(conf.tenants_path())?; crashsafe_dir::create_dir_all(conf.tenants_path())?;
@@ -195,7 +197,7 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.stdout(Stdio::null()) .stdout(Stdio::null())
.output() .output()
.with_context(|| "failed to execute initdb")?; .context("failed to execute initdb")?;
if !initdb_output.status.success() { if !initdb_output.status.success() {
anyhow::bail!( anyhow::bail!(
"initdb failed: '{}'", "initdb failed: '{}'",
@@ -306,7 +308,7 @@ pub(crate) fn create_branch(
let timeline = repo let timeline = repo
.get_timeline(startpoint.timelineid)? .get_timeline(startpoint.timelineid)?
.local_timeline() .local_timeline()
.ok_or_else(|| anyhow!("Cannot branch off the timeline that's not present locally"))?; .context("Cannot branch off the timeline that's not present locally")?;
if startpoint.lsn == Lsn(0) { if startpoint.lsn == Lsn(0) {
// Find end of WAL on the old timeline // Find end of WAL on the old timeline
let end_of_wal = timeline.get_last_record_lsn(); let end_of_wal = timeline.get_last_record_lsn();
@@ -322,12 +324,13 @@ pub(crate) fn create_branch(
timeline.wait_lsn(startpoint.lsn)?; timeline.wait_lsn(startpoint.lsn)?;
} }
startpoint.lsn = startpoint.lsn.align(); startpoint.lsn = startpoint.lsn.align();
if timeline.get_start_lsn() > startpoint.lsn { if timeline.get_ancestor_lsn() > startpoint.lsn {
// can we safely just branch from the ancestor instead?
anyhow::bail!( anyhow::bail!(
"invalid startpoint {} for the branch {}: less than timeline start {}", "invalid startpoint {} for the branch {}: less than timeline ancestor lsn {:?}",
startpoint.lsn, startpoint.lsn,
branchname, branchname,
timeline.get_start_lsn() timeline.get_ancestor_lsn()
); );
} }
@@ -381,14 +384,11 @@ fn parse_point_in_time(
let mut strings = s.split('@'); let mut strings = s.split('@');
let name = strings.next().unwrap(); let name = strings.next().unwrap();
let lsn: Option<Lsn>; let lsn = strings
if let Some(lsnstr) = strings.next() { .next()
lsn = Some( .map(Lsn::from_str)
Lsn::from_str(lsnstr).with_context(|| "invalid LSN in point-in-time specification")?, .transpose()
); .context("invalid LSN in point-in-time specification")?;
} else {
lsn = None
}
// Check if it's a tag // Check if it's a tag
if lsn.is_none() { if lsn.is_none() {
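The `.map(...).transpose()` idiom used above converts `Option<&str>` into `Result<Option<Lsn>>` in one step; a standalone sketch of the same pattern, with `u64` standing in for `Lsn`:
```rust
use std::str::FromStr;

use anyhow::Context;

// Parse an optional "name@lsn" suffix: absent => Ok(None), present but invalid => Err.
fn parse_optional_lsn(spec: &str) -> anyhow::Result<Option<u64>> {
    let mut parts = spec.split('@');
    let _name = parts.next().unwrap(); // `split` always yields at least one element
    parts
        .next()                        // Option<&str>
        .map(u64::from_str)            // Option<Result<u64, _>>
        .transpose()                   // Result<Option<u64>, _>
        .context("invalid LSN in point-in-time specification")
}

fn main() {
    assert_eq!(parse_optional_lsn("main").unwrap(), None);
    assert_eq!(parse_optional_lsn("main@42").unwrap(), Some(42));
    assert!(parse_optional_lsn("main@notanumber").is_err());
}
```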


@@ -4,7 +4,7 @@
//! file, or on the command line. //! file, or on the command line.
//! See also `settings.md` for better description on every parameter. //! See also `settings.md` for better description on every parameter.
use anyhow::{anyhow, bail, ensure, Context, Result}; use anyhow::{bail, ensure, Context, Result};
use toml_edit; use toml_edit;
use toml_edit::{Document, Item}; use toml_edit::{Document, Item};
use zenith_utils::postgres_backend::AuthType; use zenith_utils::postgres_backend::AuthType;
@@ -135,6 +135,8 @@ pub struct S3Config {
pub bucket_name: String, pub bucket_name: String,
/// The region where the bucket is located at. /// The region where the bucket is located at.
pub bucket_region: String, pub bucket_region: String,
/// A "subfolder" in the bucket, so that multiple pageservers can use the same bucket at once without interfering with each other.
pub prefix_in_bucket: Option<String>,
/// "Login" to use when connecting to bucket. /// "Login" to use when connecting to bucket.
/// Can be empty for cases like AWS k8s IAM /// Can be empty for cases like AWS k8s IAM
/// where we can allow certain pods to connect /// where we can allow certain pods to connect
@@ -142,6 +144,13 @@ pub struct S3Config {
pub access_key_id: Option<String>, pub access_key_id: Option<String>,
/// "Password" to use when connecting to bucket. /// "Password" to use when connecting to bucket.
pub secret_access_key: Option<String>, pub secret_access_key: Option<String>,
/// A base URL to send S3 requests to.
/// By default, the endpoint is derived from the region name, which is assumed to be
/// an AWS S3 region; an unrecognized region name is an error.
/// Setting the endpoint explicitly provides a way to support other S3 flavors and their regions.
///
/// Example: `http://127.0.0.1:5000`
pub endpoint: Option<String>,
} }
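For illustration only, an `S3Config` pointing at a local S3-compatible service might be built like this (the values are the sample ones from the README above; the struct is assumed to be in scope, and this is a sketch, not code from the repository):
```rust
// Sketch only: assumes the S3Config struct above is in scope.
fn example_s3_config() -> S3Config {
    S3Config {
        bucket_name: "some-sample-bucket".to_string(),
        bucket_region: "eu-north-1".to_string(),
        prefix_in_bucket: Some("/test_prefix/".to_string()),
        // Credentials can also be None, e.g. when AWS IAM grants access to the pod.
        access_key_id: Some("SOMEKEYAAAAASADSAH*#".to_string()),
        secret_access_key: Some("SOMEsEcReTsd292v".to_string()),
        // Overrides the endpoint that would otherwise be derived from `bucket_region`.
        endpoint: Some("http://127.0.0.1:5000".to_string()),
    }
}
```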
impl std::fmt::Debug for S3Config { impl std::fmt::Debug for S3Config {
@@ -149,6 +158,7 @@ impl std::fmt::Debug for S3Config {
f.debug_struct("S3Config") f.debug_struct("S3Config")
.field("bucket_name", &self.bucket_name) .field("bucket_name", &self.bucket_name)
.field("bucket_region", &self.bucket_region) .field("bucket_region", &self.bucket_region)
.field("prefix_in_bucket", &self.prefix_in_bucket)
.finish() .finish()
} }
} }
@@ -303,9 +313,7 @@ impl PageServerConf {
}) })
.ok() .ok()
.and_then(NonZeroUsize::new) .and_then(NonZeroUsize::new)
.ok_or_else(|| { .context("'max_concurrent_sync' must be a non-zero positive integer")?
anyhow!("'max_concurrent_sync' must be a non-zero positive integer")
})?
} else { } else {
NonZeroUsize::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC).unwrap() NonZeroUsize::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC).unwrap()
}; };
@@ -318,7 +326,7 @@ impl PageServerConf {
}) })
.ok() .ok()
.and_then(NonZeroU32::new) .and_then(NonZeroU32::new)
.ok_or_else(|| anyhow!("'max_sync_errors' must be a non-zero positive integer"))? .context("'max_sync_errors' must be a non-zero positive integer")?
} else { } else {
NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap() NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap()
}; };
@@ -332,18 +340,30 @@ impl PageServerConf {
bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
} }
(None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config { (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
bucket_name: bucket_name.as_str().unwrap().to_string(), bucket_name: parse_toml_string("bucket_name", bucket_name)?,
bucket_region: bucket_region.as_str().unwrap().to_string(), bucket_region: parse_toml_string("bucket_region", bucket_region)?,
access_key_id: toml access_key_id: toml
.get("access_key_id") .get("access_key_id")
.map(|x| x.as_str().unwrap().to_string()), .map(|access_key_id| parse_toml_string("access_key_id", access_key_id))
.transpose()?,
secret_access_key: toml secret_access_key: toml
.get("secret_access_key") .get("secret_access_key")
.map(|x| x.as_str().unwrap().to_string()), .map(|secret_access_key| {
parse_toml_string("secret_access_key", secret_access_key)
})
.transpose()?,
prefix_in_bucket: toml
.get("prefix_in_bucket")
.map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
.transpose()?,
endpoint: toml
.get("endpoint")
.map(|endpoint| parse_toml_string("endpoint", endpoint))
.transpose()?,
}), }),
(Some(local_path), None, None) => { (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
RemoteStorageKind::LocalFs(PathBuf::from(local_path.as_str().unwrap())) parse_toml_string("local_path", local_path)?,
} )),
(Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"), (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
}; };
@@ -385,7 +405,7 @@ impl PageServerConf {
fn parse_toml_string(name: &str, item: &Item) -> Result<String> { fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
let s = item let s = item
.as_str() .as_str()
.ok_or_else(|| anyhow!("configure option {} is not a string", name))?; .with_context(|| format!("configure option {} is not a string", name))?;
Ok(s.to_string()) Ok(s.to_string())
} }
@@ -394,7 +414,7 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
// for our use, though. // for our use, though.
let i: i64 = item let i: i64 = item
.as_integer() .as_integer()
.ok_or_else(|| anyhow!("configure option {} is not an integer", name))?; .with_context(|| format!("configure option {} is not an integer", name))?;
if i < 0 { if i < 0 {
bail!("configure option {} cannot be negative", name); bail!("configure option {} cannot be negative", name);
} }
@@ -404,7 +424,7 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> { fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
let s = item let s = item
.as_str() .as_str()
.ok_or_else(|| anyhow!("configure option {} is not a string", name))?; .with_context(|| format!("configure option {} is not a string", name))?;
Ok(humantime::parse_duration(s)?) Ok(humantime::parse_duration(s)?)
} }
@@ -412,7 +432,7 @@ fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
fn parse_toml_auth_type(name: &str, item: &Item) -> Result<AuthType> { fn parse_toml_auth_type(name: &str, item: &Item) -> Result<AuthType> {
let v = item let v = item
.as_str() .as_str()
.ok_or_else(|| anyhow!("configure option {} is not a string", name))?; .with_context(|| format!("configure option {} is not a string", name))?;
AuthType::from_str(v) AuthType::from_str(v)
} }
@@ -585,8 +605,10 @@ pg_distrib_dir='{}'
let bucket_name = "some-sample-bucket".to_string(); let bucket_name = "some-sample-bucket".to_string();
let bucket_region = "eu-north-1".to_string(); let bucket_region = "eu-north-1".to_string();
let prefix_in_bucket = "test_prefix".to_string();
let access_key_id = "SOMEKEYAAAAASADSAH*#".to_string(); let access_key_id = "SOMEKEYAAAAASADSAH*#".to_string();
let secret_access_key = "SOMEsEcReTsd292v".to_string(); let secret_access_key = "SOMEsEcReTsd292v".to_string();
let endpoint = "http://localhost:5000".to_string();
let max_concurrent_sync = NonZeroUsize::new(111).unwrap(); let max_concurrent_sync = NonZeroUsize::new(111).unwrap();
let max_sync_errors = NonZeroU32::new(222).unwrap(); let max_sync_errors = NonZeroU32::new(222).unwrap();
@@ -597,13 +619,15 @@ max_concurrent_sync = {}
max_sync_errors = {} max_sync_errors = {}
bucket_name = '{}' bucket_name = '{}'
bucket_region = '{}' bucket_region = '{}'
prefix_in_bucket = '{}'
access_key_id = '{}' access_key_id = '{}'
secret_access_key = '{}'"#, secret_access_key = '{}'
max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, access_key_id, secret_access_key endpoint = '{}'"#,
max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint
), ),
format!( format!(
"remote_storage={{max_concurrent_sync = {}, max_sync_errors = {}, bucket_name='{}', bucket_region='{}', access_key_id='{}', secret_access_key='{}'}}", "remote_storage={{max_concurrent_sync={}, max_sync_errors={}, bucket_name='{}', bucket_region='{}', prefix_in_bucket='{}', access_key_id='{}', secret_access_key='{}', endpoint='{}'}}",
max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, access_key_id, secret_access_key max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint
), ),
]; ];
@@ -637,6 +661,8 @@ pg_distrib_dir='{}'
bucket_region: bucket_region.clone(), bucket_region: bucket_region.clone(),
access_key_id: Some(access_key_id.clone()), access_key_id: Some(access_key_id.clone()),
secret_access_key: Some(secret_access_key.clone()), secret_access_key: Some(secret_access_key.clone()),
prefix_in_bucket: Some(prefix_in_bucket.clone()),
endpoint: Some(endpoint.clone())
}), }),
}, },
"Remote storage config should correctly parse the S3 config" "Remote storage config should correctly parse the S3 config"


@@ -1,6 +1,6 @@
use std::sync::Arc; use std::sync::Arc;
use anyhow::{bail, Context, Result}; use anyhow::{Context, Result};
use hyper::header; use hyper::header;
use hyper::StatusCode; use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri}; use hyper::{Body, Request, Response, Uri};
@@ -25,6 +25,7 @@ use zenith_utils::zid::{opt_display_serde, ZTimelineId};
use super::models::BranchCreateRequest; use super::models::BranchCreateRequest;
use super::models::TenantCreateRequest; use super::models::TenantCreateRequest;
use crate::branches::BranchInfo; use crate::branches::BranchInfo;
use crate::repository::RepositoryTimeline;
use crate::repository::TimelineSyncState; use crate::repository::TimelineSyncState;
use crate::{branches, config::PageServerConf, tenant_mgr, ZTenantId}; use crate::{branches, config::PageServerConf, tenant_mgr, ZTenantId};
@@ -190,18 +191,26 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
} }
#[derive(Debug, Serialize)] #[derive(Debug, Serialize)]
struct TimelineInfo { #[serde(tag = "type")]
#[serde(with = "hex")] enum TimelineInfo {
timeline_id: ZTimelineId, Local {
#[serde(with = "hex")] #[serde(with = "hex")]
tenant_id: ZTenantId, timeline_id: ZTimelineId,
#[serde(with = "opt_display_serde")] #[serde(with = "hex")]
ancestor_timeline_id: Option<ZTimelineId>, tenant_id: ZTenantId,
last_record_lsn: Lsn, #[serde(with = "opt_display_serde")]
prev_record_lsn: Lsn, ancestor_timeline_id: Option<ZTimelineId>,
start_lsn: Lsn, last_record_lsn: Lsn,
disk_consistent_lsn: Lsn, prev_record_lsn: Lsn,
timeline_state: Option<TimelineSyncState>, disk_consistent_lsn: Lsn,
timeline_state: Option<TimelineSyncState>,
},
Remote {
#[serde(with = "hex")]
timeline_id: ZTimelineId,
#[serde(with = "hex")]
tenant_id: ZTenantId,
},
} }
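A toy mirror of this representation (assuming serde and serde_json as dependencies; string IDs stand in for ZTimelineId/ZTenantId and u64 for Lsn) shows the JSON shape that `#[serde(tag = "type")]` produces:
```rust
use serde::Serialize;

// Toy types: string IDs instead of ZTimelineId/ZTenantId, u64 instead of Lsn.
#[derive(Serialize)]
#[serde(tag = "type")]
enum ToyTimelineInfo {
    Local { timeline_id: String, last_record_lsn: u64 },
    Remote { timeline_id: String },
}

fn main() {
    let local = ToyTimelineInfo::Local {
        timeline_id: "1111".to_string(),
        last_record_lsn: 42,
    };
    let remote = ToyTimelineInfo::Remote { timeline_id: "2222".to_string() };
    // {"type":"Local","timeline_id":"1111","last_record_lsn":42}
    println!("{}", serde_json::to_string(&local).unwrap());
    // {"type":"Remote","timeline_id":"2222"}
    println!("{}", serde_json::to_string(&remote).unwrap());
}
```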
async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> { async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -215,19 +224,21 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id) info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id)
.entered(); .entered();
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
match repo.get_timeline(timeline_id)?.local_timeline() { Ok::<_, anyhow::Error>(match repo.get_timeline(timeline_id)?.local_timeline() {
None => bail!("Timeline with id {} is not present locally", timeline_id), None => TimelineInfo::Remote {
Some(timeline) => Ok::<_, anyhow::Error>(TimelineInfo { timeline_id,
tenant_id,
},
Some(timeline) => TimelineInfo::Local {
timeline_id, timeline_id,
tenant_id, tenant_id,
ancestor_timeline_id: timeline.get_ancestor_timeline_id(), ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
disk_consistent_lsn: timeline.get_disk_consistent_lsn(), disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
last_record_lsn: timeline.get_last_record_lsn(), last_record_lsn: timeline.get_last_record_lsn(),
prev_record_lsn: timeline.get_prev_record_lsn(), prev_record_lsn: timeline.get_prev_record_lsn(),
start_lsn: timeline.get_start_lsn(),
timeline_state: repo.get_timeline_state(timeline_id), timeline_state: repo.get_timeline_state(timeline_id),
}), },
} })
}) })
.await .await
.map_err(ApiError::from_err)??; .map_err(ApiError::from_err)??;
@@ -235,6 +246,58 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
Ok(json_response(StatusCode::OK, response_data)?) Ok(json_response(StatusCode::OK, response_data)?)
} }
async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
tokio::task::spawn_blocking(move || {
let _enter =
info_span!("timeline_attach_handler", tenant = %tenant_id, timeline = %timeline_id)
.entered();
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
match repo.get_timeline(timeline_id)? {
RepositoryTimeline::Local(_) => {
anyhow::bail!("Timeline with id {} is already local", timeline_id)
}
RepositoryTimeline::Remote {
id: _,
disk_consistent_lsn: _,
} => {
// FIXME (rodionov) get_timeline already schedules the timeline for download, and duplicate tasks can cause errors;
// the first issue should be fixed in https://github.com/zenithdb/zenith/issues/997
// TODO (rodionov) change the timeline state to "awaiting download" (encapsulate it somewhere in the repo)
// TODO (rodionov) can we safely request replication on the timeline before sync is completed? (can be implemented on top of #997)
Ok(())
}
}
})
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::ACCEPTED, ())?)
}
async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
tokio::task::spawn_blocking(move || {
let _enter =
info_span!("timeline_detach_handler", tenant = %tenant_id, timeline = %timeline_id)
.entered();
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
repo.detach_timeline(timeline_id)
})
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::OK, ())?)
}
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> { async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
// check for management permission // check for management permission
check_permission(&request, None)?; check_permission(&request, None)?;
@@ -255,13 +318,13 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
let request_data: TenantCreateRequest = json_request(&mut request).await?; let request_data: TenantCreateRequest = json_request(&mut request).await?;
let response_data = tokio::task::spawn_blocking(move || { tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_create", tenant = %request_data.tenant_id).entered(); let _enter = info_span!("tenant_create", tenant = %request_data.tenant_id).entered();
tenant_mgr::create_repository_for_tenant(get_config(&request), request_data.tenant_id) tenant_mgr::create_repository_for_tenant(get_config(&request), request_data.tenant_id)
}) })
.await .await
.map_err(ApiError::from_err)??; .map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::CREATED, response_data)?) Ok(json_response(StatusCode::CREATED, ())?)
} }
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> { async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -296,6 +359,14 @@ pub fn make_router(
"/v1/timeline/:tenant_id/:timeline_id", "/v1/timeline/:tenant_id/:timeline_id",
timeline_detail_handler, timeline_detail_handler,
) )
.post(
"/v1/timeline/:tenant_id/:timeline_id/attach",
timeline_attach_handler,
)
.post(
"/v1/timeline/:tenant_id/:timeline_id/detach",
timeline_detach_handler,
)
.get("/v1/branch/:tenant_id", branch_list_handler) .get("/v1/branch/:tenant_id", branch_list_handler)
.get("/v1/branch/:tenant_id/:branch_name", branch_detail_handler) .get("/v1/branch/:tenant_id/:branch_name", branch_detail_handler)
.post("/v1/branch", branch_create_handler) .post("/v1/branch", branch_create_handler)


@@ -7,7 +7,7 @@ use std::fs::File;
use std::io::{Read, Seek, SeekFrom}; use std::io::{Read, Seek, SeekFrom};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use anyhow::{anyhow, bail, ensure, Result}; use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes; use bytes::Bytes;
use tracing::*; use tracing::*;
@@ -126,7 +126,7 @@ pub fn import_timeline_from_postgres_datadir(
writer.advance_last_record_lsn(lsn); writer.advance_last_record_lsn(lsn);
// We expect the Postgres server to be shut down cleanly. // We expect the Postgres server to be shut down cleanly.
let pg_control = pg_control.ok_or_else(|| anyhow!("pg_control file not found"))?; let pg_control = pg_control.context("pg_control file not found")?;
ensure!( ensure!(
pg_control.state == DBState_DB_SHUTDOWNED, pg_control.state == DBState_DB_SHUTDOWNED,
"Postgres cluster was not shut down cleanly" "Postgres cluster was not shut down cleanly"


@@ -11,7 +11,7 @@
//! parent timeline, and the last LSN that has been written to disk. //! parent timeline, and the last LSN that has been written to disk.
//! //!
use anyhow::{anyhow, bail, ensure, Context, Result}; use anyhow::{bail, ensure, Context, Result};
use bookfile::Book; use bookfile::Book;
use bytes::Bytes; use bytes::Bytes;
use lazy_static::lazy_static; use lazy_static::lazy_static;
@@ -28,7 +28,7 @@ use std::io::Write;
use std::ops::{Bound::Included, Deref}; use std::ops::{Bound::Included, Deref};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::atomic::{self, AtomicBool, AtomicUsize}; use std::sync::atomic::{self, AtomicBool, AtomicUsize};
use std::sync::{Arc, Mutex, MutexGuard}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard};
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME};
@@ -40,8 +40,8 @@ use crate::repository::{
BlockNumber, GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncState, BlockNumber, GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncState,
TimelineWriter, ZenithWalRecord, TimelineWriter, ZenithWalRecord,
}; };
use crate::tenant_mgr; use crate::thread_mgr;
use crate::walreceiver; use crate::virtual_file::VirtualFile;
use crate::walreceiver::IS_WAL_RECEIVER; use crate::walreceiver::IS_WAL_RECEIVER;
use crate::walredo::WalRedoManager; use crate::walredo::WalRedoManager;
use crate::CheckpointConfig; use crate::CheckpointConfig;
@@ -71,7 +71,6 @@ mod storage_layer;
use delta_layer::DeltaLayer; use delta_layer::DeltaLayer;
use ephemeral_file::is_ephemeral_file; use ephemeral_file::is_ephemeral_file;
use filename::{DeltaFileName, ImageFileName}; use filename::{DeltaFileName, ImageFileName};
use global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
use image_layer::ImageLayer; use image_layer::ImageLayer;
use inmemory_layer::InMemoryLayer; use inmemory_layer::InMemoryLayer;
use layer_map::LayerMap; use layer_map::LayerMap;
@@ -127,7 +126,13 @@ pub struct LayeredRepository {
conf: &'static PageServerConf, conf: &'static PageServerConf,
tenantid: ZTenantId, tenantid: ZTenantId,
timelines: Mutex<HashMap<ZTimelineId, LayeredTimelineEntry>>, timelines: Mutex<HashMap<ZTimelineId, LayeredTimelineEntry>>,
// This mutex prevents creation of new timelines during GC.
// Adding yet another mutex (in addition to `timelines`) is needed because holding the
// `timelines` mutex for the whole GC iteration (especially with a forced checkpoint)
// could block `get_timeline`, `get_timelines_state` and other timeline operations for a
// long time, which in turn may cause replication connections to drop, wait_for_lsn
// timeouts to expire, and so on.
gc_cs: Mutex<()>,
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>, walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
/// Makes every timeline back up its files to remote storage. /// Makes every timeline back up its files to remote storage.
upload_relishes: bool, upload_relishes: bool,
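The locking discipline described in the `gc_cs` comment above can be illustrated with a self-contained toy (this is not the pageserver code; `ToyRepo` and `gc_iteration` are made up for illustration):
```rust
use std::collections::HashMap;
use std::sync::Mutex;

struct ToyRepo {
    gc_cs: Mutex<()>,
    timelines: Mutex<HashMap<u64, String>>,
}

fn gc_iteration(repo: &ToyRepo) {
    let _gc_guard = repo.gc_cs.lock().unwrap(); // held for the whole GC run, blocks branch creation
    let timelines = repo.timelines.lock().unwrap(); // held only to snapshot the IDs
    let ids: Vec<u64> = timelines.keys().copied().collect();
    drop(timelines); // other timeline operations can proceed during the slow part
    for id in ids {
        // per-timeline checkpoint + GC would happen here, re-locking
        // `timelines` only for the short moments it is actually needed
        let _ = id;
    }
}

fn main() {
    let repo = ToyRepo {
        gc_cs: Mutex::new(()),
        timelines: Mutex::new(HashMap::from([(1, "main".to_string())])),
    };
    gc_iteration(&repo);
}
```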
@@ -161,7 +166,7 @@ impl Repository for LayeredRepository {
// Create the timeline directory, and write initial metadata to file. // Create the timeline directory, and write initial metadata to file.
crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?; crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?;
let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), Lsn(0), initdb_lsn); let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn);
Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?; Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?;
let timeline = LayeredTimeline::new( let timeline = LayeredTimeline::new(
@@ -186,6 +191,8 @@ impl Repository for LayeredRepository {
// We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn
// about timelines, so otherwise a race condition is possible, where we create new timeline and GC // about timelines, so otherwise a race condition is possible, where we create new timeline and GC
// concurrently removes data that is needed by the new timeline. // concurrently removes data that is needed by the new timeline.
let _gc_cs = self.gc_cs.lock().unwrap();
let mut timelines = self.timelines.lock().unwrap(); let mut timelines = self.timelines.lock().unwrap();
let src_timeline = match self.get_or_init_timeline(src, &mut timelines)? { let src_timeline = match self.get_or_init_timeline(src, &mut timelines)? {
LayeredTimelineEntry::Local(timeline) => timeline, LayeredTimelineEntry::Local(timeline) => timeline,
@@ -193,9 +200,10 @@ impl Repository for LayeredRepository {
bail!("Cannot branch off the timeline {} that's not local", src) bail!("Cannot branch off the timeline {} that's not local", src)
} }
}; };
let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
src_timeline src_timeline
.check_lsn_is_in_scope(start_lsn) .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
.context("invalid branch start lsn")?; .context("invalid branch start lsn")?;
let RecordLsn { let RecordLsn {
@@ -223,7 +231,7 @@ impl Repository for LayeredRepository {
dst_prev, dst_prev,
Some(src), Some(src),
start_lsn, start_lsn,
src_timeline.latest_gc_cutoff_lsn.load(), *src_timeline.latest_gc_cutoff_lsn.read().unwrap(),
src_timeline.initdb_lsn, src_timeline.initdb_lsn,
); );
crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?; crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?;
@@ -277,15 +285,42 @@ impl Repository for LayeredRepository {
Ok(()) Ok(())
} }
// Wait for all threads to complete and persist repository data before pageserver shutdown. // Detaches the timeline from the repository.
fn shutdown(&self) -> Result<()> { fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> {
trace!("LayeredRepository shutdown for tenant {}", self.tenantid); let mut timelines = self.timelines.lock().unwrap();
match timelines.entry(timeline_id) {
Entry::Vacant(_) => {
bail!("cannot detach non existing timeline");
}
Entry::Occupied(mut entry) => {
let timeline_entry = entry.get_mut();
let timelines = self.timelines.lock().unwrap(); let timeline = match timeline_entry {
for (timelineid, timeline) in timelines.iter() { LayeredTimelineEntry::Remote { .. } => {
shutdown_timeline(self.tenantid, *timelineid, timeline)?; bail!("cannot detach remote timeline {}", timeline_id);
} }
LayeredTimelineEntry::Local(timeline) => timeline,
};
// TODO (rodionov) keep local state in timeline itself (refactoring related to https://github.com/zenithdb/zenith/issues/997 and #1104)
// FIXME this is the local disk_consistent_lsn; we need to keep the latest successfully uploaded checkpoint lsn in the timeline (metadata?)
// https://github.com/zenithdb/zenith/issues/1104
let remote_disk_consistent_lsn = timeline.disk_consistent_lsn.load();
// reference to timeline is dropped here
entry.insert(LayeredTimelineEntry::Remote {
id: timeline_id,
disk_consistent_lsn: remote_disk_consistent_lsn,
});
}
};
// Release the lock to shutdown and remove the files without holding it
drop(timelines);
// shutdown the timeline (this shuts down the walreceiver)
thread_mgr::shutdown_threads(None, Some(self.tenantid), Some(timeline_id));
// remove timeline files (maybe avoid this for ease of debugging if something goes wrong)
fs::remove_dir_all(self.conf.timeline_path(&timeline_id, &self.tenantid))?;
Ok(()) Ok(())
} }
@@ -298,9 +333,13 @@ impl Repository for LayeredRepository {
timeline_id: ZTimelineId, timeline_id: ZTimelineId,
new_state: TimelineSyncState, new_state: TimelineSyncState,
) -> Result<()> { ) -> Result<()> {
debug!(
"set_timeline_state: timeline_id: {}, new_state: {:?}",
timeline_id, new_state
);
let mut timelines_accessor = self.timelines.lock().unwrap(); let mut timelines_accessor = self.timelines.lock().unwrap();
let timeline_to_shutdown = match new_state { match new_state {
TimelineSyncState::Ready(_) => { TimelineSyncState::Ready(_) => {
let reloaded_timeline = let reloaded_timeline =
self.init_local_timeline(timeline_id, &mut timelines_accessor)?; self.init_local_timeline(timeline_id, &mut timelines_accessor)?;
@@ -318,12 +357,9 @@ impl Repository for LayeredRepository {
}, },
), ),
}; };
// NOTE we do not delete local data here when a timeline becomes cloud-only; that is performed in detach_timeline
drop(timelines_accessor); drop(timelines_accessor);
if let Some(timeline) = timeline_to_shutdown {
shutdown_timeline(self.tenantid, timeline_id, &timeline)?;
}
Ok(()) Ok(())
} }
@@ -349,30 +385,6 @@ impl Repository for LayeredRepository {
} }
} }
fn shutdown_timeline(
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
timeline: &LayeredTimelineEntry,
) -> Result<(), anyhow::Error> {
match timeline {
LayeredTimelineEntry::Local(timeline) => {
timeline
.upload_relishes
.store(false, atomic::Ordering::Relaxed);
walreceiver::stop_wal_receiver(timeline_id);
trace!("repo shutdown. checkpoint timeline {}", timeline_id);
// Do not reconstruct pages to reduce shutdown time
timeline.checkpoint(CheckpointConfig::Flush)?;
//TODO Wait for walredo process to shutdown too
}
LayeredTimelineEntry::Remote { .. } => warn!(
"Skipping shutdown of a remote timeline {} for tenant {}",
timeline_id, tenant_id
),
}
Ok(())
}
#[derive(Clone)] #[derive(Clone)]
enum LayeredTimelineEntry { enum LayeredTimelineEntry {
Local(Arc<LayeredTimeline>), Local(Arc<LayeredTimeline>),
@@ -489,6 +501,7 @@ impl LayeredRepository {
tenantid, tenantid,
conf, conf,
timelines: Mutex::new(HashMap::new()), timelines: Mutex::new(HashMap::new()),
gc_cs: Mutex::new(()),
walredo_mgr, walredo_mgr,
upload_relishes, upload_relishes,
} }
@@ -505,10 +518,10 @@ impl LayeredRepository {
let _enter = info_span!("saving metadata").entered(); let _enter = info_span!("saving metadata").entered();
let path = metadata_path(conf, timelineid, tenantid); let path = metadata_path(conf, timelineid, tenantid);
// use OpenOptions to ensure file presence is consistent with first_save // use OpenOptions to ensure file presence is consistent with first_save
let mut file = OpenOptions::new() let mut file = VirtualFile::open_with_options(
.write(true) &path,
.create_new(first_save) OpenOptions::new().write(true).create_new(first_save),
.open(&path)?; )?;
let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?; let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
@@ -575,7 +588,8 @@ impl LayeredRepository {
let now = Instant::now(); let now = Instant::now();
// grab mutex to prevent new timelines from being created here. // grab mutex to prevent new timelines from being created here.
// TODO: We will hold it for a long time let _gc_cs = self.gc_cs.lock().unwrap();
let mut timelines = self.timelines.lock().unwrap(); let mut timelines = self.timelines.lock().unwrap();
// Scan all timelines. For each timeline, remember the timeline ID and // Scan all timelines. For each timeline, remember the timeline ID and
@@ -597,7 +611,7 @@ impl LayeredRepository {
} }
} }
//Now collect info about branchpoints // Now collect info about branchpoints
let mut all_branchpoints: BTreeSet<(ZTimelineId, Lsn)> = BTreeSet::new(); let mut all_branchpoints: BTreeSet<(ZTimelineId, Lsn)> = BTreeSet::new();
for &timelineid in &timelineids { for &timelineid in &timelineids {
let timeline = match self.get_or_init_timeline(timelineid, &mut timelines)? { let timeline = match self.get_or_init_timeline(timelineid, &mut timelines)? {
@@ -641,8 +655,10 @@ impl LayeredRepository {
// Ok, we now know all the branch points. // Ok, we now know all the branch points.
// Perform GC for each timeline. // Perform GC for each timeline.
for timelineid in timelineids { for timelineid in timelineids {
if tenant_mgr::shutdown_requested() { if thread_mgr::is_shutdown_requested() {
return Ok(totals); // We were requested to shut down. Stop and return with the progress we
// made.
break;
} }
// We have already loaded all timelines above // We have already loaded all timelines above
@@ -663,6 +679,7 @@ impl LayeredRepository {
} }
if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) { if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) {
drop(timelines);
let branchpoints: Vec<Lsn> = all_branchpoints let branchpoints: Vec<Lsn> = all_branchpoints
.range(( .range((
Included((timelineid, Lsn(0))), Included((timelineid, Lsn(0))),
@@ -678,10 +695,10 @@ impl LayeredRepository {
timeline.checkpoint(CheckpointConfig::Forced)?; timeline.checkpoint(CheckpointConfig::Forced)?;
info!("timeline {} checkpoint_before_gc done", timelineid); info!("timeline {} checkpoint_before_gc done", timelineid);
} }
let result = timeline.gc_timeline(branchpoints, cutoff)?; let result = timeline.gc_timeline(branchpoints, cutoff)?;
totals += result; totals += result;
timelines = self.timelines.lock().unwrap();
} }
} }
@@ -759,8 +776,14 @@ pub struct LayeredTimeline {
/// to avoid deadlock. /// to avoid deadlock.
write_lock: Mutex<()>, write_lock: Mutex<()>,
// Prevent concurrent checkpoints.
// Checkpoints are normally performed by one thread. But a checkpoint can also be requested manually by an admin
// (that's used in tests), and shutdown also forces a checkpoint. These forced checkpoints run in a different thread
// and could be triggered at the same time as a normal checkpoint.
checkpoint_cs: Mutex<()>,
// Needed to ensure that we can't create a branch at a point that was already garbage collected // Needed to ensure that we can't create a branch at a point that was already garbage collected
latest_gc_cutoff_lsn: AtomicLsn, latest_gc_cutoff_lsn: RwLock<Lsn>,
// It may change across major versions so for simplicity // It may change across major versions so for simplicity
// keep it after running initdb for a timeline. // keep it after running initdb for a timeline.
@@ -804,6 +827,10 @@ impl Timeline for LayeredTimeline {
Ok(()) Ok(())
} }
fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard<Lsn> {
self.latest_gc_cutoff_lsn.read().unwrap()
}
/// Look up given page version. /// Look up given page version.
fn get_page_at_lsn(&self, rel: RelishTag, rel_blknum: BlockNumber, lsn: Lsn) -> Result<Bytes> { fn get_page_at_lsn(&self, rel: RelishTag, rel_blknum: BlockNumber, lsn: Lsn) -> Result<Bytes> {
if !rel.is_blocky() && rel_blknum != 0 { if !rel.is_blocky() && rel_blknum != 0 {
@@ -814,14 +841,6 @@ impl Timeline for LayeredTimeline {
); );
} }
debug_assert!(lsn <= self.get_last_record_lsn()); debug_assert!(lsn <= self.get_last_record_lsn());
let latest_gc_cutoff_lsn = self.latest_gc_cutoff_lsn.load();
// error instead of assert to simplify testing
ensure!(
lsn >= latest_gc_cutoff_lsn,
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
lsn, latest_gc_cutoff_lsn
);
let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum);
if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? {
@@ -992,21 +1011,16 @@ impl Timeline for LayeredTimeline {
/// ///
/// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn. /// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn.
/// ///
fn check_lsn_is_in_scope(&self, lsn: Lsn) -> Result<()> { fn check_lsn_is_in_scope(
let initdb_lsn = self.initdb_lsn; &self,
lsn: Lsn,
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
) -> Result<()> {
ensure!( ensure!(
lsn >= initdb_lsn, lsn >= **latest_gc_cutoff_lsn,
"LSN {} is earlier than initdb lsn {}",
lsn,
initdb_lsn,
);
let latest_gc_cutoff_lsn = self.latest_gc_cutoff_lsn.load();
ensure!(
lsn >= latest_gc_cutoff_lsn,
"LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
lsn, lsn,
latest_gc_cutoff_lsn, **latest_gc_cutoff_lsn,
); );
Ok(()) Ok(())
} }
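The reason `check_lsn_is_in_scope` now takes a `RwLockReadGuard` is that the caller holds the guard across the whole check-and-use sequence, so GC (which needs the write lock to advance the cutoff) cannot move the horizon in between. A minimal toy sketch of that pattern, with `u64` standing in for `Lsn` (illustrative only):
```rust
use std::sync::{RwLock, RwLockReadGuard};

struct ToyTimeline {
    latest_gc_cutoff_lsn: RwLock<u64>,
}

impl ToyTimeline {
    fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard<u64> {
        self.latest_gc_cutoff_lsn.read().unwrap()
    }

    fn check_lsn_is_in_scope(&self, lsn: u64, cutoff: &RwLockReadGuard<u64>) -> anyhow::Result<()> {
        anyhow::ensure!(lsn >= **cutoff, "LSN {} is earlier than GC horizon {}", lsn, **cutoff);
        Ok(())
    }
}

fn main() -> anyhow::Result<()> {
    let tl = ToyTimeline { latest_gc_cutoff_lsn: RwLock::new(100) };
    let cutoff = tl.get_latest_gc_cutoff_lsn(); // GC cannot advance the cutoff while this is held
    tl.check_lsn_is_in_scope(150, &cutoff)?;
    // ... branch creation would proceed here, still under the read guard
    drop(cutoff);
    Ok(())
}
```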
@@ -1023,14 +1037,6 @@ impl Timeline for LayeredTimeline {
self.last_record_lsn.load() self.last_record_lsn.load()
} }
fn get_start_lsn(&self) -> Lsn {
self.ancestor_timeline
.as_ref()
.and_then(|ancestor_entry| ancestor_entry.local_or_schedule_download(self.tenantid))
.map(Timeline::get_start_lsn)
.unwrap_or(self.ancestor_lsn)
}
fn get_current_logical_size(&self) -> usize { fn get_current_logical_size(&self) -> usize {
self.current_logical_size.load(atomic::Ordering::Acquire) as usize self.current_logical_size.load(atomic::Ordering::Acquire) as usize
} }
@@ -1118,8 +1124,9 @@ impl LayeredTimeline {
upload_relishes: AtomicBool::new(upload_relishes), upload_relishes: AtomicBool::new(upload_relishes),
write_lock: Mutex::new(()), write_lock: Mutex::new(()),
checkpoint_cs: Mutex::new(()),
latest_gc_cutoff_lsn: AtomicLsn::from(metadata.latest_gc_cutoff_lsn()), latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()),
initdb_lsn: metadata.initdb_lsn(), initdb_lsn: metadata.initdb_lsn(),
} }
} }
@@ -1145,8 +1152,8 @@ impl LayeredTimeline {
// create an ImageLayer struct for each image file. // create an ImageLayer struct for each image file.
if imgfilename.lsn > disk_consistent_lsn { if imgfilename.lsn > disk_consistent_lsn {
warn!( warn!(
"found future image layer {} on timeline {}", "found future image layer {} on timeline {} disk_consistent_lsn is {}",
imgfilename, self.timelineid imgfilename, self.timelineid, disk_consistent_lsn
); );
rename_to_backup(direntry.path())?; rename_to_backup(direntry.path())?;
@@ -1169,8 +1176,8 @@ impl LayeredTimeline {
// before crash. // before crash.
if deltafilename.end_lsn > disk_consistent_lsn + 1 { if deltafilename.end_lsn > disk_consistent_lsn + 1 {
warn!( warn!(
"found future delta layer {} on timeline {}", "found future delta layer {} on timeline {} disk_consistent_lsn is {}",
deltafilename, self.timelineid deltafilename, self.timelineid, disk_consistent_lsn
); );
rename_to_backup(direntry.path())?; rename_to_backup(direntry.path())?;
@@ -1366,7 +1373,7 @@ impl LayeredTimeline {
self.tenantid, self.tenantid,
seg, seg,
lsn, lsn,
lsn, last_record_lsn,
)?; )?;
} else { } else {
return Ok(open_layer); return Ok(open_layer);
@@ -1409,7 +1416,7 @@ impl LayeredTimeline {
self.timelineid, self.timelineid,
self.tenantid, self.tenantid,
start_lsn, start_lsn,
lsn, last_record_lsn,
)?; )?;
} else { } else {
// New relation. // New relation.
@@ -1420,8 +1427,14 @@ impl LayeredTimeline {
lsn lsn
); );
layer = layer = InMemoryLayer::create(
InMemoryLayer::create(self.conf, self.timelineid, self.tenantid, seg, lsn, lsn)?; self.conf,
self.timelineid,
self.tenantid,
seg,
lsn,
last_record_lsn,
)?;
} }
let layer_rc: Arc<InMemoryLayer> = Arc::new(layer); let layer_rc: Arc<InMemoryLayer> = Arc::new(layer);
@@ -1435,7 +1448,10 @@ impl LayeredTimeline {
/// ///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. /// NOTE: This has nothing to do with checkpoint in PostgreSQL.
fn checkpoint_internal(&self, checkpoint_distance: u64, reconstruct_pages: bool) -> Result<()> { fn checkpoint_internal(&self, checkpoint_distance: u64, reconstruct_pages: bool) -> Result<()> {
let mut write_guard = self.write_lock.lock().unwrap(); // Prevent concurrent checkpoints
let _checkpoint_cs = self.checkpoint_cs.lock().unwrap();
let write_guard = self.write_lock.lock().unwrap();
let mut layers = self.layers.lock().unwrap(); let mut layers = self.layers.lock().unwrap();
// Bump the generation number in the layer map, so that we can distinguish // Bump the generation number in the layer map, so that we can distinguish
@@ -1461,11 +1477,17 @@ impl LayeredTimeline {
let mut disk_consistent_lsn = last_record_lsn; let mut disk_consistent_lsn = last_record_lsn;
let mut layer_paths = Vec::new(); let mut layer_paths = Vec::new();
let mut freeze_end_lsn = Lsn(0);
let mut evicted_layers = Vec::new();
//
// Determine which layers we need to evict and calculate max(latest_lsn)
// among those layers.
//
while let Some((oldest_layer_id, oldest_layer, oldest_generation)) = while let Some((oldest_layer_id, oldest_layer, oldest_generation)) =
layers.peek_oldest_open() layers.peek_oldest_open()
{ {
let oldest_pending_lsn = oldest_layer.get_oldest_pending_lsn(); let oldest_lsn = oldest_layer.get_oldest_lsn();
// Does this layer need freezing? // Does this layer need freezing?
// //
// Write out all in-memory layers that contain WAL older than CHECKPOINT_DISTANCE. // Write out all in-memory layers that contain WAL older than CHECKPOINT_DISTANCE.
@@ -1474,28 +1496,60 @@ impl LayeredTimeline {
// when we started. We don't want to process layers inserted after we started, to // when we started. We don't want to process layers inserted after we started, to
// avoid getting into an infinite loop trying to process again entries that we // avoid getting into an infinite loop trying to process again entries that we
// inserted ourselves. // inserted ourselves.
let distance = last_record_lsn.widening_sub(oldest_pending_lsn); //
if distance < 0 // Once we have decided to write out at least one layer, we must also write out
// any other layers that contain WAL older than the end LSN of the layers we have
// already decided to write out. In other words, we must write out all layers
// whose [oldest_lsn, latest_lsn) range overlaps with any of the other layers
// that we are writing out. Otherwise, when we advance 'disk_consistent_lsn', it's
// ambiguous whether those layers are already durable on disk or not. For example,
// imagine that there are two layers in memory that contain page versions in the
// following LSN ranges:
//
// A: 100-150
// B: 110-200
//
// If we flush layer A, we must also flush layer B, because they overlap. If we
// flushed only A, and advanced 'disk_consistent_lsn' to 150, we would break the
// rule that all WAL older than 'disk_consistent_lsn' are durable on disk, because
// B contains some WAL older than 150. On the other hand, if we flushed out A and
// advanced 'disk_consistent_lsn' only up to 110, after crash and restart we would
// delete the first layer because its end LSN is larger than 110. If we changed
// the deletion logic to not delete it, then we would start streaming at 110, and
// process again the WAL records in the range 110-150 that are already in layer A,
// and the WAL processing code does not cope with that. We solve that dilemma by
// insisting that if we write out the first layer, we also write out the second
// layer, and advance disk_consistent_lsn all the way up to 200.
//
let distance = last_record_lsn.widening_sub(oldest_lsn);
if (distance < 0
|| distance < checkpoint_distance.into() || distance < checkpoint_distance.into()
|| oldest_generation == current_generation || oldest_generation == current_generation)
&& oldest_lsn >= freeze_end_lsn
// this layer intersects with an evicted layer and so also needs to be evicted
{ {
info!( info!(
"the oldest layer is now {} which is {} bytes behind last_record_lsn", "the oldest layer is now {} which is {} bytes behind last_record_lsn",
oldest_layer.filename().display(), oldest_layer.filename().display(),
distance distance
); );
disk_consistent_lsn = oldest_pending_lsn; disk_consistent_lsn = oldest_lsn;
break; break;
} }
let latest_lsn = oldest_layer.get_latest_lsn();
if latest_lsn > freeze_end_lsn {
freeze_end_lsn = latest_lsn; // calculate max of latest_lsn of the layers we're about to evict
}
layers.remove_open(oldest_layer_id);
evicted_layers.push((oldest_layer_id, oldest_layer));
}
drop(layers); // Freeze evicted layers
drop(write_guard); for (_evicted_layer_id, evicted_layer) in evicted_layers.iter() {
// Mark the layer as no longer accepting writes and record the end_lsn.
let mut this_layer_paths = self.evict_layer(oldest_layer_id, reconstruct_pages)?; // This happens in-place, no new layers are created now.
layer_paths.append(&mut this_layer_paths); evicted_layer.freeze(freeze_end_lsn);
layers.insert_historic(evicted_layer.clone());
write_guard = self.write_lock.lock().unwrap();
layers = self.layers.lock().unwrap();
} }
// Call unload() on all frozen layers, to release memory. // Call unload() on all frozen layers, to release memory.
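A self-contained toy model of the overlap rule described in the comment above (not the actual eviction code; it assumes the oldest layer has already been selected for flushing, e.g. because it exceeded the checkpoint distance):
```rust
// Toy model: each in-memory layer is an (oldest_lsn, latest_lsn) pair, ordered oldest-first.
fn layers_to_flush(layers: &[(u64, u64)]) -> (Vec<(u64, u64)>, u64) {
    let mut flush = Vec::new();
    let mut freeze_end_lsn = 0;
    for &(oldest_lsn, latest_lsn) in layers {
        // Stop once a layer starts at or beyond the flush boundary: it does not
        // overlap anything we are writing out, so it can stay in memory.
        if !flush.is_empty() && oldest_lsn >= freeze_end_lsn {
            break;
        }
        freeze_end_lsn = freeze_end_lsn.max(latest_lsn);
        flush.push((oldest_lsn, latest_lsn));
    }
    // disk_consistent_lsn may advance up to freeze_end_lsn once these layers are durable.
    (flush, freeze_end_lsn)
}

fn main() {
    // A: 100-150 and B: 110-200 overlap, so both must be flushed together and
    // disk_consistent_lsn can then advance to 200; C: 210-250 stays in memory.
    let (flushed, end) = layers_to_flush(&[(100, 150), (110, 200), (210, 250)]);
    assert_eq!(flushed, vec![(100, 150), (110, 200)]);
    assert_eq!(end, 200);
}
```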
@@ -1508,6 +1562,14 @@ impl LayeredTimeline {
drop(layers); drop(layers);
drop(write_guard); drop(write_guard);
// Create delta/image layers for evicted layers
for (_evicted_layer_id, evicted_layer) in evicted_layers.iter() {
let mut this_layer_paths =
self.evict_layer(evicted_layer.clone(), reconstruct_pages)?;
layer_paths.append(&mut this_layer_paths);
}
// Sync layers
if !layer_paths.is_empty() { if !layer_paths.is_empty() {
// We must fsync the timeline dir to ensure the directory entries for // We must fsync the timeline dir to ensure the directory entries for
// new layer files are durable // new layer files are durable
@@ -1548,7 +1610,7 @@ impl LayeredTimeline {
ondisk_prev_record_lsn, ondisk_prev_record_lsn,
ancestor_timelineid, ancestor_timelineid,
self.ancestor_lsn, self.ancestor_lsn,
self.latest_gc_cutoff_lsn.load(), *self.latest_gc_cutoff_lsn.read().unwrap(),
self.initdb_lsn, self.initdb_lsn,
); );
@@ -1575,52 +1637,29 @@ impl LayeredTimeline {
Ok(()) Ok(())
} }
fn evict_layer(&self, layer_id: LayerId, reconstruct_pages: bool) -> Result<Vec<PathBuf>> { fn evict_layer(
// Mark the layer as no longer accepting writes and record the end_lsn. &self,
// This happens in-place, no new layers are created now. layer: Arc<InMemoryLayer>,
// We call `get_last_record_lsn` again, which may be different from the reconstruct_pages: bool,
// original load, as we may have released the write lock since then. ) -> Result<Vec<PathBuf>> {
let new_historics = layer.write_to_disk(self, reconstruct_pages)?;
let mut write_guard = self.write_lock.lock().unwrap();
let mut layers = self.layers.lock().unwrap();
let mut layer_paths = Vec::new(); let mut layer_paths = Vec::new();
let _write_guard = self.write_lock.lock().unwrap();
let mut layers = self.layers.lock().unwrap();
let global_layer_map = GLOBAL_LAYER_MAP.read().unwrap(); // Finally, replace the frozen in-memory layer with the new on-disk layers
if let Some(oldest_layer) = global_layer_map.get(&layer_id) { layers.remove_historic(layer);
drop(global_layer_map);
oldest_layer.freeze(self.get_last_record_lsn());
// The layer is no longer open, update the layer map to reflect this. // Add the historics to the LayerMap
// We will replace it with on-disk historics below. for delta_layer in new_historics.delta_layers {
layers.remove_open(layer_id); layer_paths.push(delta_layer.path());
layers.insert_historic(oldest_layer.clone()); layers.insert_historic(Arc::new(delta_layer));
}
// Write the now-frozen layer to disk. That could take a while, so release the lock while do it for image_layer in new_historics.image_layers {
drop(layers); layer_paths.push(image_layer.path());
drop(write_guard); layers.insert_historic(Arc::new(image_layer));
let new_historics = oldest_layer.write_to_disk(self, reconstruct_pages)?;
write_guard = self.write_lock.lock().unwrap();
layers = self.layers.lock().unwrap();
// Finally, replace the frozen in-memory layer with the new on-disk layers
layers.remove_historic(oldest_layer);
// Add the historics to the LayerMap
for delta_layer in new_historics.delta_layers {
layer_paths.push(delta_layer.path());
layers.insert_historic(Arc::new(delta_layer));
}
for image_layer in new_historics.image_layers {
layer_paths.push(image_layer.path());
layers.insert_historic(Arc::new(image_layer));
}
} }
drop(layers);
drop(write_guard);
Ok(layer_paths) Ok(layer_paths)
} }
@@ -1649,12 +1688,14 @@ impl LayeredTimeline {
pub fn gc_timeline(&self, retain_lsns: Vec<Lsn>, cutoff: Lsn) -> Result<GcResult> { pub fn gc_timeline(&self, retain_lsns: Vec<Lsn>, cutoff: Lsn) -> Result<GcResult> {
let now = Instant::now(); let now = Instant::now();
let mut result: GcResult = Default::default(); let mut result: GcResult = Default::default();
let disk_consistent_lsn = self.get_disk_consistent_lsn();
let _checkpoint_cs = self.checkpoint_cs.lock().unwrap();
let _enter = info_span!("garbage collection", timeline = %self.timelineid, tenant = %self.tenantid, cutoff = %cutoff).entered(); let _enter = info_span!("garbage collection", timeline = %self.timelineid, tenant = %self.tenantid, cutoff = %cutoff).entered();
// We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn.
// See branch_timeline() for details. // See branch_timeline() for details.
self.latest_gc_cutoff_lsn.store(cutoff); *self.latest_gc_cutoff_lsn.write().unwrap() = cutoff;
info!("GC starting"); info!("GC starting");
@@ -1734,7 +1775,12 @@ impl LayeredTimeline {
} }
// 3. Is there a later on-disk layer for this relation? // 3. Is there a later on-disk layer for this relation?
if !l.is_dropped() && !layers.newer_image_layer_exists(l.get_seg_tag(), l.get_end_lsn()) if !l.is_dropped()
&& !layers.newer_image_layer_exists(
l.get_seg_tag(),
l.get_end_lsn(),
disk_consistent_lsn,
)
{ {
info!( info!(
"keeping {} {}-{} because it is the latest layer", "keeping {} {}-{} because it is the latest layer",
@@ -2187,11 +2233,10 @@ impl<'a> TimelineWriter for LayeredTimelineWriter<'a> {
let oldsize = self let oldsize = self
.tl .tl
.get_relish_size(rel, self.tl.get_last_record_lsn())? .get_relish_size(rel, self.tl.get_last_record_lsn())?
.ok_or_else(|| { .with_context(|| {
anyhow!( format!(
"attempted to truncate non-existent relish {} at {}", "attempted to truncate non-existent relish {} at {}",
rel, rel, lsn
lsn
) )
})?; })?;
@@ -2314,8 +2359,5 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
} }
} }
Err(anyhow!( bail!("couldn't find an unused backup number for {:?}", path)
"couldn't find an unused backup number for {:?}",
path
))
} }
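Several hunks in this change, starting with the two above, replace hand-rolled Err(anyhow!(...)) constructions with bail! and turn ok_or_else(|| anyhow!(...)) on Options into with_context. A minimal sketch of both idioms, using hypothetical helpers rather than functions from the patch:

use anyhow::{bail, Context, Result};

fn backup_number(path: &str, used: &[u32]) -> Result<u32> {
    for n in 0..10u32 {
        if !used.contains(&n) {
            return Ok(n);
        }
    }
    // bail!(...) is shorthand for `return Err(anyhow!(...))`.
    bail!("couldn't find an unused backup number for {:?}", path)
}

fn first_used(used: &[u32]) -> Result<u32> {
    // anyhow's Context trait is also implemented for Option, so this replaces
    // ok_or_else(|| anyhow!(...)).
    used.first()
        .copied()
        .with_context(|| format!("no backup numbers are in use ({:?})", used))
}

fn main() -> Result<()> {
    println!("{}", backup_number("layer.old", &[0, 1])?);
    println!("{}", first_used(&[3])?);
    Ok(())
}

Because with_context takes a closure, the error message is only formatted on the failure path.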


@@ -169,7 +169,7 @@ impl DeltaLayerInner {
if let Some((_entry_lsn, entry)) = slice.last() { if let Some((_entry_lsn, entry)) = slice.last() {
Ok(*entry) Ok(*entry)
} else { } else {
Err(anyhow::anyhow!("could not find seg size in delta layer")) bail!("could not find seg size in delta layer")
} }
} }
} }


@@ -173,7 +173,14 @@ impl Layer for ImageLayer {
.as_ref() .as_ref()
.unwrap() .unwrap()
.chapter_reader(BLOCKY_IMAGES_CHAPTER)?; .chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
chapter.read_exact_at(&mut buf, offset)?;
chapter.read_exact_at(&mut buf, offset).with_context(|| {
format!(
"failed to read page from data file {} at offset {}",
self.filename().display(),
offset
)
})?;
buf buf
} }


@@ -39,8 +39,20 @@ pub struct InMemoryLayer {
/// ///
start_lsn: Lsn, start_lsn: Lsn,
/// LSN of the oldest page version stored in this layer ///
oldest_pending_lsn: Lsn, /// LSN of the oldest page version stored in this layer.
///
/// This is different from 'start_lsn' in that we enforce that the 'start_lsn'
/// of a layer always matches the 'end_lsn' of its predecessor, even if there
/// are no page versions until at a later LSN. That way you can detect any
/// missing layer files more easily. 'oldest_lsn' is the first page version
/// actually stored in this layer. In the range between 'start_lsn' and
/// 'oldest_lsn', there are no changes to the segment.
/// 'oldest_lsn' is used to adjust 'disk_consistent_lsn' and that is why it should
/// point to the beginning of WAL record. This is the other difference with 'start_lsn'
/// which points to end of WAL record. This is why 'oldest_lsn' can be smaller than 'start_lsn'.
///
oldest_lsn: Lsn,
/// The above fields never change. The parts that do change are in 'inner', /// The above fields never change. The parts that do change are in 'inner',
/// and protected by mutex. /// and protected by mutex.
@@ -73,6 +85,14 @@ pub struct InMemoryLayerInner {
/// a non-blocky rel, 'seg_sizes' is not used and is always empty. /// a non-blocky rel, 'seg_sizes' is not used and is always empty.
/// ///
seg_sizes: VecMap<Lsn, SegmentBlk>, seg_sizes: VecMap<Lsn, SegmentBlk>,
///
/// LSN of the newest page version stored in this layer.
///
/// The difference between 'end_lsn' and 'latest_lsn' is the same as between
/// 'start_lsn' and 'oldest_lsn'. See comments in 'oldest_lsn'.
///
latest_lsn: Lsn,
} }
impl InMemoryLayerInner { impl InMemoryLayerInner {
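A minimal sketch of the invariants described in the new comments above: start_lsn matches the predecessor layer's end_lsn, oldest_lsn is the first page version actually stored (and, pointing at the start of a WAL record, may be smaller than start_lsn), and latest_lsn only moves forward on writes. The struct and numbers are illustrative, not the real InMemoryLayer:

struct LsnRange {
    start_lsn: u64,  // matches the predecessor layer's end_lsn
    oldest_lsn: u64, // first page version actually stored; start of a WAL record
    latest_lsn: u64, // newest page version stored so far
}

impl LsnRange {
    fn new(start_lsn: u64, oldest_lsn: u64) -> Self {
        LsnRange { start_lsn, oldest_lsn, latest_lsn: oldest_lsn }
    }

    fn put_page_version(&mut self, lsn: u64) {
        // Mirrors the assert!(lsn >= inner.latest_lsn) added further down:
        // page versions must arrive in non-decreasing LSN order.
        assert!(lsn >= self.latest_lsn);
        self.latest_lsn = lsn;
    }
}

fn main() {
    // oldest_lsn may precede start_lsn because it points at the beginning of a
    // WAL record while start_lsn points at the end of the previous one.
    let mut range = LsnRange::new(0x20, 0x18);
    range.put_page_version(0x28);
    range.put_page_version(0x30);
    println!(
        "start {:#x}, oldest {:#x}, latest {:#x}",
        range.start_lsn, range.oldest_lsn, range.latest_lsn
    );
}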
@@ -319,8 +339,13 @@ pub struct LayersOnDisk {
impl InMemoryLayer { impl InMemoryLayer {
/// Return the oldest page version that's stored in this layer /// Return the oldest page version that's stored in this layer
pub fn get_oldest_pending_lsn(&self) -> Lsn { pub fn get_oldest_lsn(&self) -> Lsn {
self.oldest_pending_lsn self.oldest_lsn
}
pub fn get_latest_lsn(&self) -> Lsn {
let inner = self.inner.read().unwrap();
inner.latest_lsn
} }
/// ///
@@ -332,7 +357,7 @@ impl InMemoryLayer {
tenantid: ZTenantId, tenantid: ZTenantId,
seg: SegmentTag, seg: SegmentTag,
start_lsn: Lsn, start_lsn: Lsn,
oldest_pending_lsn: Lsn, oldest_lsn: Lsn,
) -> Result<InMemoryLayer> { ) -> Result<InMemoryLayer> {
trace!( trace!(
"initializing new empty InMemoryLayer for writing {} on timeline {} at {}", "initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
@@ -355,13 +380,14 @@ impl InMemoryLayer {
tenantid, tenantid,
seg, seg,
start_lsn, start_lsn,
oldest_pending_lsn, oldest_lsn,
incremental: false, incremental: false,
inner: RwLock::new(InMemoryLayerInner { inner: RwLock::new(InMemoryLayerInner {
end_lsn: None, end_lsn: None,
dropped: false, dropped: false,
page_versions: PageVersions::new(file), page_versions: PageVersions::new(file),
seg_sizes, seg_sizes,
latest_lsn: oldest_lsn,
}), }),
}) })
} }
@@ -398,6 +424,8 @@ impl InMemoryLayer {
let mut inner = self.inner.write().unwrap(); let mut inner = self.inner.write().unwrap();
inner.assert_writeable(); inner.assert_writeable();
assert!(lsn >= inner.latest_lsn);
inner.latest_lsn = lsn;
let old = inner.page_versions.append_or_update_last(blknum, lsn, pv)?; let old = inner.page_versions.append_or_update_last(blknum, lsn, pv)?;
@@ -509,12 +537,11 @@ impl InMemoryLayer {
timelineid: ZTimelineId, timelineid: ZTimelineId,
tenantid: ZTenantId, tenantid: ZTenantId,
start_lsn: Lsn, start_lsn: Lsn,
oldest_pending_lsn: Lsn, oldest_lsn: Lsn,
) -> Result<InMemoryLayer> { ) -> Result<InMemoryLayer> {
let seg = src.get_seg_tag(); let seg = src.get_seg_tag();
assert!(oldest_pending_lsn.is_aligned()); assert!(oldest_lsn.is_aligned());
assert!(oldest_pending_lsn >= start_lsn);
trace!( trace!(
"initializing new InMemoryLayer for writing {} on timeline {} at {}", "initializing new InMemoryLayer for writing {} on timeline {} at {}",
@@ -538,13 +565,14 @@ impl InMemoryLayer {
tenantid, tenantid,
seg, seg,
start_lsn, start_lsn,
oldest_pending_lsn, oldest_lsn,
incremental: true, incremental: true,
inner: RwLock::new(InMemoryLayerInner { inner: RwLock::new(InMemoryLayerInner {
end_lsn: None, end_lsn: None,
dropped: false, dropped: false,
page_versions: PageVersions::new(file), page_versions: PageVersions::new(file),
seg_sizes, seg_sizes,
latest_lsn: oldest_lsn,
}), }),
}) })
} }


@@ -40,7 +40,7 @@ pub struct LayerMap {
/// All the layers keyed by segment tag /// All the layers keyed by segment tag
segs: HashMap<SegmentTag, SegEntry>, segs: HashMap<SegmentTag, SegEntry>,
/// All in-memory layers, ordered by 'oldest_pending_lsn' and generation /// All in-memory layers, ordered by 'oldest_lsn' and generation
/// of each layer. This allows easy access to the in-memory layer that /// of each layer. This allows easy access to the in-memory layer that
/// contains the oldest WAL record. /// contains the oldest WAL record.
open_layers: BinaryHeap<OpenLayerEntry>, open_layers: BinaryHeap<OpenLayerEntry>,
@@ -83,16 +83,16 @@ impl LayerMap {
let layer_id = segentry.update_open(Arc::clone(&layer)); let layer_id = segentry.update_open(Arc::clone(&layer));
let oldest_pending_lsn = layer.get_oldest_pending_lsn(); let oldest_lsn = layer.get_oldest_lsn();
// After a crash and restart, 'oldest_pending_lsn' of the oldest in-memory // After a crash and restart, 'oldest_lsn' of the oldest in-memory
// layer becomes the WAL streaming starting point, so it better not point // layer becomes the WAL streaming starting point, so it better not point
// in the middle of a WAL record. // in the middle of a WAL record.
assert!(oldest_pending_lsn.is_aligned()); assert!(oldest_lsn.is_aligned());
// Also add it to the binary heap // Also add it to the binary heap
let open_layer_entry = OpenLayerEntry { let open_layer_entry = OpenLayerEntry {
oldest_pending_lsn: layer.get_oldest_pending_lsn(), oldest_lsn: layer.get_oldest_lsn(),
layer_id, layer_id,
generation: self.current_generation, generation: self.current_generation,
}; };
@@ -191,9 +191,15 @@ impl LayerMap {
/// ///
/// This is used for garbage collection, to determine if an old layer can /// This is used for garbage collection, to determine if an old layer can
/// be deleted. /// be deleted.
pub fn newer_image_layer_exists(&self, seg: SegmentTag, lsn: Lsn) -> bool { /// We ignore segments newer than disk_consistent_lsn because they will be removed at restart
pub fn newer_image_layer_exists(
&self,
seg: SegmentTag,
lsn: Lsn,
disk_consistent_lsn: Lsn,
) -> bool {
if let Some(segentry) = self.segs.get(&seg) { if let Some(segentry) = self.segs.get(&seg) {
segentry.newer_image_layer_exists(lsn) segentry.newer_image_layer_exists(lsn, disk_consistent_lsn)
} else { } else {
false false
} }
@@ -311,13 +317,18 @@ impl SegEntry {
self.historic.search(lsn) self.historic.search(lsn)
} }
pub fn newer_image_layer_exists(&self, lsn: Lsn) -> bool { pub fn newer_image_layer_exists(&self, lsn: Lsn, disk_consistent_lsn: Lsn) -> bool {
// We only check on-disk layers, because // We only check on-disk layers, because
// in-memory layers are not durable // in-memory layers are not durable
// The end-LSN is exclusive, while disk_consistent_lsn is
// inclusive. For example, if disk_consistent_lsn is 100, it is
// OK for a delta layer to have end LSN 101, but if the end LSN
// is 102, then it might not have been fully flushed to disk
// before crash.
self.historic self.historic
.iter_newer(lsn) .iter_newer(lsn)
.any(|layer| !layer.is_incremental()) .any(|layer| !layer.is_incremental() && layer.get_end_lsn() <= disk_consistent_lsn + 1)
} }
// Set new open layer for a SegEntry. // Set new open layer for a SegEntry.
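The new disk_consistent_lsn parameter to newer_image_layer_exists deserves a worked example: layer end LSNs are exclusive while disk_consistent_lsn is inclusive, so a layer ending exactly one past the cutoff is still fully durable. A minimal sketch with plain u64s in place of Lsn:

fn covered_by_disk_consistent_lsn(layer_end_lsn: u64, disk_consistent_lsn: u64) -> bool {
    // With disk_consistent_lsn = 100, an image layer ending at 101 covers
    // records up to and including 100 and is known durable; one ending at 102
    // may not have been fully flushed before a crash, so GC must ignore it.
    layer_end_lsn <= disk_consistent_lsn + 1
}

fn main() {
    assert!(covered_by_disk_consistent_lsn(101, 100));
    assert!(!covered_by_disk_consistent_lsn(102, 100));
}

GC therefore only treats an image layer as a reason to drop older layers when that check passes; anything newer would be removed at restart anyway.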
@@ -341,23 +352,23 @@ impl SegEntry {
} }
/// Entry held in LayerMap::open_layers, with boilerplate comparison routines /// Entry held in LayerMap::open_layers, with boilerplate comparison routines
/// to implement a min-heap ordered by 'oldest_pending_lsn' and 'generation' /// to implement a min-heap ordered by 'oldest_lsn' and 'generation'
/// ///
/// The generation number associated with each entry can be used to distinguish /// The generation number associated with each entry can be used to distinguish
/// recently-added entries (i.e after last call to increment_generation()) from older /// recently-added entries (i.e after last call to increment_generation()) from older
/// entries with the same 'oldest_pending_lsn'. /// entries with the same 'oldest_lsn'.
struct OpenLayerEntry { struct OpenLayerEntry {
oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn() oldest_lsn: Lsn, // copy of layer.get_oldest_lsn()
generation: u64, generation: u64,
layer_id: LayerId, layer_id: LayerId,
} }
impl Ord for OpenLayerEntry { impl Ord for OpenLayerEntry {
fn cmp(&self, other: &Self) -> Ordering { fn cmp(&self, other: &Self) -> Ordering {
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
// to get that. Entries with identical oldest_pending_lsn are ordered by generation // to get that. Entries with identical oldest_lsn are ordered by generation
other other
.oldest_pending_lsn .oldest_lsn
.cmp(&self.oldest_pending_lsn) .cmp(&self.oldest_lsn)
.then_with(|| other.generation.cmp(&self.generation)) .then_with(|| other.generation.cmp(&self.generation))
} }
} }
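The reversed comparison above is the standard trick for getting a min-heap out of std's max-heap BinaryHeap. A small self-contained sketch with illustrative fields (no LayerId, plain u64 LSNs):

use std::cmp::Ordering;
use std::collections::BinaryHeap;

#[derive(PartialEq, Eq)]
struct Entry {
    oldest_lsn: u64,
    generation: u64,
}

impl Ord for Entry {
    fn cmp(&self, other: &Self) -> Ordering {
        // Compare other-to-self so BinaryHeap::pop returns the smallest entry,
        // breaking ties by the smallest generation.
        other
            .oldest_lsn
            .cmp(&self.oldest_lsn)
            .then_with(|| other.generation.cmp(&self.generation))
    }
}

impl PartialOrd for Entry {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut heap = BinaryHeap::new();
    heap.push(Entry { oldest_lsn: 30, generation: 1 });
    heap.push(Entry { oldest_lsn: 10, generation: 2 });
    heap.push(Entry { oldest_lsn: 10, generation: 1 });
    let top = heap.pop().unwrap();
    assert_eq!((top.oldest_lsn, top.generation), (10, 1));
}

Popping always yields the entry with the smallest oldest_lsn, which is exactly the "layer holding the oldest WAL record" lookup the layer map needs.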
@@ -426,7 +437,7 @@ mod tests {
conf: &'static PageServerConf, conf: &'static PageServerConf,
segno: u32, segno: u32,
start_lsn: Lsn, start_lsn: Lsn,
oldest_pending_lsn: Lsn, oldest_lsn: Lsn,
) -> Arc<InMemoryLayer> { ) -> Arc<InMemoryLayer> {
Arc::new( Arc::new(
InMemoryLayer::create( InMemoryLayer::create(
@@ -438,7 +449,7 @@ mod tests {
segno, segno,
}, },
start_lsn, start_lsn,
oldest_pending_lsn, oldest_lsn,
) )
.unwrap(), .unwrap(),
) )


@@ -11,6 +11,7 @@ pub mod remote_storage;
pub mod repository; pub mod repository;
pub mod tenant_mgr; pub mod tenant_mgr;
pub mod tenant_threads; pub mod tenant_threads;
pub mod thread_mgr;
pub mod virtual_file; pub mod virtual_file;
pub mod walingest; pub mod walingest;
pub mod walreceiver; pub mod walreceiver;


@@ -10,16 +10,15 @@
// *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url // *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
// //
use anyhow::{anyhow, bail, ensure, Context, Result}; use anyhow::{bail, ensure, Context, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut}; use bytes::{Buf, BufMut, Bytes, BytesMut};
use lazy_static::lazy_static; use lazy_static::lazy_static;
use regex::Regex; use regex::Regex;
use std::io;
use std::net::TcpListener; use std::net::TcpListener;
use std::str; use std::str;
use std::str::FromStr; use std::str::FromStr;
use std::sync::Arc; use std::sync::{Arc, RwLockReadGuard};
use std::thread;
use std::{io, net::TcpStream};
use tracing::*; use tracing::*;
use zenith_metrics::{register_histogram_vec, HistogramVec}; use zenith_metrics::{register_histogram_vec, HistogramVec};
use zenith_utils::auth::{self, JwtAuth}; use zenith_utils::auth::{self, JwtAuth};
@@ -39,6 +38,8 @@ use crate::config::PageServerConf;
use crate::relish::*; use crate::relish::*;
use crate::repository::Timeline; use crate::repository::Timeline;
use crate::tenant_mgr; use crate::tenant_mgr;
use crate::thread_mgr;
use crate::thread_mgr::ThreadKind;
use crate::walreceiver; use crate::walreceiver;
use crate::CheckpointConfig; use crate::CheckpointConfig;
@@ -189,30 +190,61 @@ pub fn thread_main(
listener: TcpListener, listener: TcpListener,
auth_type: AuthType, auth_type: AuthType,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let mut join_handles = Vec::new(); listener.set_nonblocking(true)?;
let basic_rt = tokio::runtime::Builder::new_current_thread()
.enable_io()
.build()?;
while !tenant_mgr::shutdown_requested() { let tokio_listener = {
let (socket, peer_addr) = listener.accept()?; let _guard = basic_rt.enter();
debug!("accepted connection from {}", peer_addr); tokio::net::TcpListener::from_std(listener)
socket.set_nodelay(true).unwrap(); }?;
let local_auth = auth.clone();
let handle = thread::Builder::new() // Wait for a new connection to arrive, or for server shutdown.
.name("serving Page Service thread".into()) while let Some(res) = basic_rt.block_on(async {
.spawn(move || { let shutdown_watcher = thread_mgr::shutdown_watcher();
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) { tokio::select! {
error!(%err, "page server thread exited with error"); biased;
_ = shutdown_watcher => {
// We were requested to shut down.
None
}
res = tokio_listener.accept() => {
Some(res)
}
}
}) {
match res {
Ok((socket, peer_addr)) => {
// Connection established. Spawn a new thread to handle it.
debug!("accepted connection from {}", peer_addr);
let local_auth = auth.clone();
// PageRequestHandler threads are not associated with any particular
// timeline in the thread manager. In practice most connections will
// only deal with a particular timeline, but we don't know which one
// yet.
if let Err(err) = thread_mgr::spawn(
ThreadKind::PageRequestHandler,
None,
None,
"serving Page Service thread",
move || page_service_conn_main(conf, local_auth, socket, auth_type),
) {
// Thread creation failed. Log the error and continue.
error!("could not spawn page service thread: {:?}", err);
} }
}) }
.unwrap(); Err(err) => {
// accept() failed. Log the error, and loop back to retry on next connection.
join_handles.push(handle); error!("accept() failed: {:?}", err);
}
}
} }
debug!("page_service loop terminated. wait for connections to cancel"); debug!("page_service loop terminated");
for handle in join_handles.into_iter() {
handle.join().unwrap();
}
Ok(()) Ok(())
} }
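The rewritten accept loop above multiplexes listener.accept() with a shutdown signal via tokio::select!. A minimal sketch of the same shape, using a plain tokio::sync::watch channel where the pageserver uses thread_mgr::shutdown_watcher() (the bind address and logging are illustrative):

use std::time::Duration;
use tokio::net::TcpListener;
use tokio::sync::watch;

async fn accept_loop(listener: TcpListener, mut shutdown_rx: watch::Receiver<bool>) {
    loop {
        tokio::select! {
            biased;
            _ = shutdown_rx.changed() => {
                // Shutdown requested: stop accepting new connections.
                break;
            }
            res = listener.accept() => {
                match res {
                    Ok((_socket, peer_addr)) => {
                        // The real server hands the socket to a handler thread here.
                        println!("accepted connection from {peer_addr}");
                    }
                    Err(err) => {
                        // accept() failed: log it and keep serving.
                        eprintln!("accept() failed: {err:?}");
                    }
                }
            }
        }
    }
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let listener = TcpListener::bind("127.0.0.1:0").await?;
    let (shutdown_tx, shutdown_rx) = watch::channel(false);
    let server = tokio::spawn(accept_loop(listener, shutdown_rx));
    tokio::time::sleep(Duration::from_millis(50)).await;
    shutdown_tx.send(true).unwrap(); // trigger shutdown
    server.await.unwrap();
    Ok(())
}

The biased; keyword makes select! poll the shutdown branch first, so a pending shutdown wins over a simultaneously ready connection.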
@@ -220,10 +252,10 @@ pub fn thread_main(
fn page_service_conn_main( fn page_service_conn_main(
conf: &'static PageServerConf, conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>, auth: Option<Arc<JwtAuth>>,
socket: TcpStream, socket: tokio::net::TcpStream,
auth_type: AuthType, auth_type: AuthType,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
// Immediatsely increment the gauge, then create a job to decrement it on thread exit. // Immediately increment the gauge, then create a job to decrement it on thread exit.
// One of the pros of `defer!` is that this will *most probably* // One of the pros of `defer!` is that this will *most probably*
// get called, even in presence of panics. // get called, even in presence of panics.
let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]);
@@ -232,6 +264,19 @@ fn page_service_conn_main(
gauge.dec(); gauge.dec();
} }
// We use Tokio to accept the connection, but the rest of the code works with a
// regular socket. Convert.
let socket = socket
.into_std()
.context("could not convert tokio::net:TcpStream to std::net::TcpStream")?;
socket
.set_nonblocking(false)
.context("could not put socket to blocking mode")?;
socket
.set_nodelay(true)
.context("could not set TCP_NODELAY")?;
let mut conn_handler = PageServerHandler::new(conf, auth); let mut conn_handler = PageServerHandler::new(conf, auth);
let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?; let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?;
pgbackend.run(&mut conn_handler) pgbackend.run(&mut conn_handler)
@@ -286,7 +331,7 @@ impl PageServerHandler {
/* switch client to COPYBOTH */ /* switch client to COPYBOTH */
pgb.write_message(&BeMessage::CopyBothResponse)?; pgb.write_message(&BeMessage::CopyBothResponse)?;
while !tenant_mgr::shutdown_requested() { while !thread_mgr::is_shutdown_requested() {
match pgb.read_message() { match pgb.read_message() {
Ok(message) => { Ok(message) => {
if let Some(message) = message { if let Some(message) = message {
@@ -320,7 +365,7 @@ impl PageServerHandler {
let response = response.unwrap_or_else(|e| { let response = response.unwrap_or_else(|e| {
// print the all details to the log with {:#}, but for the client the // print the all details to the log with {:#}, but for the client the
// error message is enough // error message is enough
error!("error reading relation or page version: {:#}", e); error!("error reading relation or page version: {:?}", e);
PagestreamBeMessage::Error(PagestreamErrorResponse { PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(), message: e.to_string(),
}) })
@@ -353,7 +398,12 @@ impl PageServerHandler {
/// In either case, if the page server hasn't received the WAL up to the /// In either case, if the page server hasn't received the WAL up to the
/// requested LSN yet, we will wait for it to arrive. The return value is /// requested LSN yet, we will wait for it to arrive. The return value is
/// the LSN that should be used to look up the page versions. /// the LSN that should be used to look up the page versions.
fn wait_or_get_last_lsn(timeline: &dyn Timeline, lsn: Lsn, latest: bool) -> Result<Lsn> { fn wait_or_get_last_lsn(
timeline: &dyn Timeline,
mut lsn: Lsn,
latest: bool,
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
) -> Result<Lsn> {
if latest { if latest {
// Latest page version was requested. If LSN is given, it is a hint // Latest page version was requested. If LSN is given, it is a hint
// to the page server that there have been no modifications to the // to the page server that there have been no modifications to the
@@ -374,22 +424,26 @@ impl PageServerHandler {
// walsender completes the authentication and starts streaming the // walsender completes the authentication and starts streaming the
// WAL. // WAL.
if lsn <= last_record_lsn { if lsn <= last_record_lsn {
Ok(last_record_lsn) lsn = last_record_lsn;
} else { } else {
timeline.wait_lsn(lsn)?; timeline.wait_lsn(lsn)?;
// Since we waited for 'lsn' to arrive, that is now the last // Since we waited for 'lsn' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the // record LSN. (Or close enough for our purposes; the
// last-record LSN can advance immediately after we return // last-record LSN can advance immediately after we return
// anyway) // anyway)
Ok(lsn)
} }
} else { } else {
if lsn == Lsn(0) { if lsn == Lsn(0) {
bail!("invalid LSN(0) in request"); bail!("invalid LSN(0) in request");
} }
timeline.wait_lsn(lsn)?; timeline.wait_lsn(lsn)?;
Ok(lsn)
} }
ensure!(
lsn >= **latest_gc_cutoff_lsn,
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
lsn, **latest_gc_cutoff_lsn
);
Ok(lsn)
} }
fn handle_get_rel_exists_request( fn handle_get_rel_exists_request(
@@ -400,7 +454,8 @@ impl PageServerHandler {
let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();
let tag = RelishTag::Relation(req.rel); let tag = RelishTag::Relation(req.rel);
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
let exists = timeline.get_rel_exists(tag, lsn)?; let exists = timeline.get_rel_exists(tag, lsn)?;
@@ -416,7 +471,8 @@ impl PageServerHandler {
) -> Result<PagestreamBeMessage> { ) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered(); let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
let tag = RelishTag::Relation(req.rel); let tag = RelishTag::Relation(req.rel);
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
let n_blocks = timeline.get_relish_size(tag, lsn)?; let n_blocks = timeline.get_relish_size(tag, lsn)?;
@@ -437,8 +493,16 @@ impl PageServerHandler {
let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
.entered(); .entered();
let tag = RelishTag::Relation(req.rel); let tag = RelishTag::Relation(req.rel);
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
/*
// Add a 1s delay to some requests. The delayed causes the requests to
// hit the race condition from github issue #1047 more easily.
use rand::Rng;
if rand::thread_rng().gen::<u8>() < 5 {
std::thread::sleep(std::time::Duration::from_millis(1000));
}
*/
let page = timeline.get_page_at_lsn(tag, req.blkno, lsn)?; let page = timeline.get_page_at_lsn(tag, req.blkno, lsn)?;
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
@@ -459,9 +523,10 @@ impl PageServerHandler {
// check that the timeline exists // check that the timeline exists
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid) let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
.context("Cannot handle basebackup request for a remote timeline")?; .context("Cannot handle basebackup request for a remote timeline")?;
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn { if let Some(lsn) = lsn {
timeline timeline
.check_lsn_is_in_scope(lsn) .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
.context("invalid basebackup lsn")?; .context("invalid basebackup lsn")?;
} }
@@ -579,7 +644,7 @@ impl postgres_backend::Handler for PageServerHandler {
let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) ([[:xdigit:]]+) (.*)$").unwrap(); let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) ([[:xdigit:]]+) (.*)$").unwrap();
let caps = re let caps = re
.captures(query_string) .captures(query_string)
.ok_or_else(|| anyhow!("invalid callmemaybe: '{}'", query_string))?; .with_context(|| format!("invalid callmemaybe: '{}'", query_string))?;
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
@@ -594,22 +659,22 @@ impl postgres_backend::Handler for PageServerHandler {
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid) tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
.context("Failed to fetch local timeline for callmemaybe requests")?; .context("Failed to fetch local timeline for callmemaybe requests")?;
walreceiver::launch_wal_receiver(self.conf, timelineid, &connstr, tenantid.to_owned()); walreceiver::launch_wal_receiver(self.conf, tenantid, timelineid, &connstr)?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("branch_create ") { } else if query_string.starts_with("branch_create ") {
let err = || anyhow!("invalid branch_create: '{}'", query_string); let err = || format!("invalid branch_create: '{}'", query_string);
// branch_create <tenantid> <branchname> <startpoint> // branch_create <tenantid> <branchname> <startpoint>
// TODO lazy static // TODO lazy static
// TODO: escaping, to allow branch names with spaces // TODO: escaping, to allow branch names with spaces
let re = Regex::new(r"^branch_create ([[:xdigit:]]+) (\S+) ([^\r\n\s;]+)[\r\n\s;]*;?$") let re = Regex::new(r"^branch_create ([[:xdigit:]]+) (\S+) ([^\r\n\s;]+)[\r\n\s;]*;?$")
.unwrap(); .unwrap();
let caps = re.captures(query_string).ok_or_else(err)?; let caps = re.captures(query_string).with_context(err)?;
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
let branchname = caps.get(2).ok_or_else(err)?.as_str().to_owned(); let branchname = caps.get(2).with_context(err)?.as_str().to_owned();
let startpoint_str = caps.get(3).ok_or_else(err)?.as_str().to_owned(); let startpoint_str = caps.get(3).with_context(err)?.as_str().to_owned();
self.check_permission(Some(tenantid))?; self.check_permission(Some(tenantid))?;
@@ -628,7 +693,7 @@ impl postgres_backend::Handler for PageServerHandler {
let re = Regex::new(r"^branch_list ([[:xdigit:]]+)$").unwrap(); let re = Regex::new(r"^branch_list ([[:xdigit:]]+)$").unwrap();
let caps = re let caps = re
.captures(query_string) .captures(query_string)
.ok_or_else(|| anyhow!("invalid branch_list: '{}'", query_string))?; .with_context(|| format!("invalid branch_list: '{}'", query_string))?;
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
@@ -648,11 +713,11 @@ impl postgres_backend::Handler for PageServerHandler {
.write_message_noflush(&BeMessage::DataRow(&[Some(&tenants_buf)]))? .write_message_noflush(&BeMessage::DataRow(&[Some(&tenants_buf)]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("tenant_create") { } else if query_string.starts_with("tenant_create") {
let err = || anyhow!("invalid tenant_create: '{}'", query_string); let err = || format!("invalid tenant_create: '{}'", query_string);
// tenant_create <tenantid> // tenant_create <tenantid>
let re = Regex::new(r"^tenant_create ([[:xdigit:]]+)$").unwrap(); let re = Regex::new(r"^tenant_create ([[:xdigit:]]+)$").unwrap();
let caps = re.captures(query_string).ok_or_else(err)?; let caps = re.captures(query_string).with_context(err)?;
self.check_permission(None)?; self.check_permission(None)?;
@@ -683,7 +748,7 @@ impl postgres_backend::Handler for PageServerHandler {
let caps = re let caps = re
.captures(query_string) .captures(query_string)
.ok_or_else(|| anyhow!("invalid do_gc: '{}'", query_string))?; .with_context(|| format!("invalid do_gc: '{}'", query_string))?;
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
@@ -767,7 +832,7 @@ impl postgres_backend::Handler for PageServerHandler {
let caps = re let caps = re
.captures(query_string) .captures(query_string)
.ok_or_else(|| anyhow!("invalid checkpoint command: '{}'", query_string))?; .with_context(|| format!("invalid checkpoint command: '{}'", query_string))?;
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;


@@ -5,7 +5,7 @@
//! There are a few components the storage machinery consists of: //! There are a few components the storage machinery consists of:
//! * [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations: //! * [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
//! * [`local_fs`] allows to use local file system as an external storage //! * [`local_fs`] allows to use local file system as an external storage
//! * [`rust_s3`] uses AWS S3 bucket entirely as an external storage //! * [`rust_s3`] uses AWS S3 bucket as an external storage
//! //!
//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync. //! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
//! Synchronization internals are split into submodules //! Synchronization internals are split into submodules
@@ -89,7 +89,6 @@ use std::{
collections::HashMap, collections::HashMap,
ffi, fs, ffi, fs,
path::{Path, PathBuf}, path::{Path, PathBuf},
thread,
}; };
use anyhow::{bail, Context}; use anyhow::{bail, Context};
@@ -125,8 +124,6 @@ pub struct SyncStartupData {
/// To reuse the local file scan logic, the timeline states are returned even if no sync loop get started during init: /// To reuse the local file scan logic, the timeline states are returned even if no sync loop get started during init:
/// in this case, no remote files exist and all local timelines with correct metadata files are considered ready. /// in this case, no remote files exist and all local timelines with correct metadata files are considered ready.
pub initial_timeline_states: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>>, pub initial_timeline_states: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>>,
/// A handle to the sync loop, if it was started from the configuration provided.
pub sync_loop_handle: Option<thread::JoinHandle<anyhow::Result<()>>>,
} }
/// Based on the config, initiates the remote storage connection and starts a separate thread /// Based on the config, initiates the remote storage connection and starts a separate thread
@@ -141,20 +138,27 @@ pub fn start_local_timeline_sync(
match &config.remote_storage_config { match &config.remote_storage_config {
Some(storage_config) => match &storage_config.storage { Some(storage_config) => match &storage_config.storage {
RemoteStorageKind::LocalFs(root) => storage_sync::spawn_storage_sync_thread( RemoteStorageKind::LocalFs(root) => {
config, info!("Using fs root '{}' as a remote storage", root.display());
local_timeline_files, storage_sync::spawn_storage_sync_thread(
LocalFs::new(root.clone(), &config.workdir)?, config,
storage_config.max_concurrent_sync, local_timeline_files,
storage_config.max_sync_errors, LocalFs::new(root.clone(), &config.workdir)?,
), storage_config.max_concurrent_sync,
RemoteStorageKind::AwsS3(s3_config) => storage_sync::spawn_storage_sync_thread( storage_config.max_sync_errors,
config, )
local_timeline_files, },
S3::new(s3_config, &config.workdir)?, RemoteStorageKind::AwsS3(s3_config) => {
storage_config.max_concurrent_sync, info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
storage_config.max_sync_errors, s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
), storage_sync::spawn_storage_sync_thread(
config,
local_timeline_files,
S3::new(s3_config, &config.workdir)?,
storage_config.max_concurrent_sync,
storage_config.max_sync_errors,
)
},
} }
.context("Failed to spawn the storage sync thread"), .context("Failed to spawn the storage sync thread"),
None => { None => {
@@ -176,7 +180,6 @@ pub fn start_local_timeline_sync(
} }
Ok(SyncStartupData { Ok(SyncStartupData {
initial_timeline_states, initial_timeline_states,
sync_loop_handle: None,
}) })
} }
} }
@@ -205,7 +208,7 @@ fn local_tenant_timeline_files(
} }
} }
Err(e) => error!( Err(e) => error!(
"Failed to list tenants dir entry {:?} in directory {}, reason: {:#}", "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
tenants_dir_entry, tenants_dir_entry,
tenants_dir.display(), tenants_dir.display(),
e e
@@ -246,14 +249,14 @@ fn collect_timelines_for_tenant(
); );
} }
Err(e) => error!( Err(e) => error!(
"Failed to process timeline dir contents at '{}', reason: {:#}", "Failed to process timeline dir contents at '{}', reason: {:?}",
timeline_path.display(), timeline_path.display(),
e e
), ),
} }
} }
Err(e) => error!( Err(e) => error!(
"Failed to list timelines for entry tenant {}, reason: {:#}", "Failed to list timelines for entry tenant {}, reason: {:?}",
tenant_id, e tenant_id, e
), ),
} }


@@ -70,8 +70,3 @@ on the timeline download, missing remote branch files are downloaded.
A branch is a per-tenant entity, yet the current implementation requires synchronizing a timeline first to get the branch files locally. A branch is a per-tenant entity, yet the current implementation requires synchronizing a timeline first to get the branch files locally.
Currently, there's no other way to know about the remote branch files, nor are the file contents verified and updated. Currently, there's no other way to know about the remote branch files, nor are the file contents verified and updated.
* no IT tests
Automated S3 testing is lacking currently, due to no convenient way to enable backups during the tests.
After it's fixed, benchmark runs should also be carried out to find bottlenecks.


@@ -73,7 +73,7 @@ impl RemoteStorage for LocalFs {
} }
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> { async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
Ok(get_all_files(&self.root).await?.into_iter().collect()) get_all_files(&self.root).await
} }
async fn upload( async fn upload(


@@ -1,12 +1,15 @@
//! AWS S3 storage wrapper around `rust_s3` library. //! AWS S3 storage wrapper around `rust_s3` library.
//! Currently does not allow multiple pageservers to use the same bucket concurrently: objects are //!
//! placed in the root of the bucket. //! Respects `prefix_in_bucket` property from [`S3Config`],
//! allowing multiple pageservers to independently work with the same S3 bucket, if
//! their bucket prefixes are both specified and different.
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use anyhow::Context; use anyhow::Context;
use s3::{bucket::Bucket, creds::Credentials, region::Region}; use s3::{bucket::Bucket, creds::Credentials, region::Region};
use tokio::io::{self, AsyncWriteExt}; use tokio::io::{self, AsyncWriteExt};
use tracing::debug;
use crate::{ use crate::{
config::S3Config, config::S3Config,
@@ -23,8 +26,26 @@ impl S3ObjectKey {
&self.0 &self.0
} }
fn download_destination(&self, pageserver_workdir: &Path) -> PathBuf { fn download_destination(
pageserver_workdir.join(self.0.split(S3_FILE_SEPARATOR).collect::<PathBuf>()) &self,
pageserver_workdir: &Path,
prefix_to_strip: Option<&str>,
) -> PathBuf {
let path_without_prefix = match prefix_to_strip {
Some(prefix) => self.0.strip_prefix(prefix).unwrap_or_else(|| {
panic!(
"Could not strip prefix '{}' from S3 object key '{}'",
prefix, self.0
)
}),
None => &self.0,
};
pageserver_workdir.join(
path_without_prefix
.split(S3_FILE_SEPARATOR)
.collect::<PathBuf>(),
)
} }
} }
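A minimal sketch of the key-to-path mapping above, with '/' standing in for S3_FILE_SEPARATOR and a plain function instead of the S3ObjectKey method. The unwrap_or fallback and the empty-segment filter are simplifications of this sketch (the patch panics if the prefix cannot be stripped), and the tenant/timeline names are illustrative:

use std::path::{Path, PathBuf};

fn download_destination(key: &str, workdir: &Path, prefix_to_strip: Option<&str>) -> PathBuf {
    let without_prefix = match prefix_to_strip {
        Some(prefix) => key.strip_prefix(prefix).unwrap_or(key),
        None => key,
    };
    workdir.join(
        without_prefix
            .split('/')
            .filter(|segment| !segment.is_empty())
            .collect::<PathBuf>(),
    )
}

fn main() {
    let dest = download_destination(
        "dummy_prefix/tenants/t1/timelines/tl1/layer_file",
        Path::new("/data/pageserver"),
        Some("dummy_prefix"),
    );
    assert_eq!(dest, PathBuf::from("/data/pageserver/tenants/t1/timelines/tl1/layer_file"));
    println!("{}", dest.display());
}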
@@ -32,15 +53,27 @@ impl S3ObjectKey {
pub struct S3 { pub struct S3 {
pageserver_workdir: &'static Path, pageserver_workdir: &'static Path,
bucket: Bucket, bucket: Bucket,
prefix_in_bucket: Option<String>,
} }
impl S3 { impl S3 {
/// Creates the storage, errors if incorrect AWS S3 configuration provided. /// Creates the storage, errors if incorrect AWS S3 configuration provided.
pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> { pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
let region = aws_config debug!(
.bucket_region "Creating s3 remote storage around bucket {}",
.parse::<Region>() aws_config.bucket_name
.context("Failed to parse the s3 region from config")?; );
let region = match aws_config.endpoint.clone() {
Some(endpoint) => Region::Custom {
endpoint,
region: aws_config.bucket_region.clone(),
},
None => aws_config
.bucket_region
.parse::<Region>()
.context("Failed to parse the s3 region from config")?,
};
let credentials = Credentials::new( let credentials = Credentials::new(
aws_config.access_key_id.as_deref(), aws_config.access_key_id.as_deref(),
aws_config.secret_access_key.as_deref(), aws_config.secret_access_key.as_deref(),
@@ -49,6 +82,20 @@ impl S3 {
None, None,
) )
.context("Failed to create the s3 credentials")?; .context("Failed to create the s3 credentials")?;
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
let mut prefix = prefix;
while prefix.starts_with(S3_FILE_SEPARATOR) {
prefix = &prefix[1..]
}
let mut prefix = prefix.to_string();
while prefix.ends_with(S3_FILE_SEPARATOR) {
prefix.pop();
}
prefix
});
Ok(Self { Ok(Self {
bucket: Bucket::new_with_path_style( bucket: Bucket::new_with_path_style(
aws_config.bucket_name.as_str(), aws_config.bucket_name.as_str(),
@@ -57,6 +104,7 @@ impl S3 {
) )
.context("Failed to create the s3 bucket")?, .context("Failed to create the s3 bucket")?,
pageserver_workdir, pageserver_workdir,
prefix_in_bucket,
}) })
} }
} }
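The prefix handling added to S3::new trims separators so the stored prefix_in_bucket has no leading or trailing S3_FILE_SEPARATOR. A minimal standalone sketch of that normalization, with '/' as the separator:

fn normalize_prefix(prefix: &str) -> String {
    // Drop leading separators, then pop any trailing ones.
    let mut prefix = prefix.trim_start_matches('/').to_string();
    while prefix.ends_with('/') {
        prefix.pop();
    }
    prefix
}

fn main() {
    assert_eq!(normalize_prefix("/dummy_prefix/"), "dummy_prefix");
    assert_eq!(normalize_prefix("a/b/"), "a/b");
}

storage_path can then always push a separator before each path segment without ever producing a double separator in the object key.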
@@ -67,7 +115,7 @@ impl RemoteStorage for S3 {
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> { fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?; let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?;
let mut key = String::new(); let mut key = self.prefix_in_bucket.clone().unwrap_or_default();
for segment in relative_path { for segment in relative_path {
key.push(S3_FILE_SEPARATOR); key.push(S3_FILE_SEPARATOR);
key.push_str(&segment.to_string_lossy()); key.push_str(&segment.to_string_lossy());
@@ -76,13 +124,14 @@ impl RemoteStorage for S3 {
} }
fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> { fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
Ok(storage_path.download_destination(self.pageserver_workdir)) Ok(storage_path
.download_destination(self.pageserver_workdir, self.prefix_in_bucket.as_deref()))
} }
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> { async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
let list_response = self let list_response = self
.bucket .bucket
.list(String::new(), None) .list(self.prefix_in_bucket.clone().unwrap_or_default(), None)
.await .await
.context("Failed to list s3 objects")?; .context("Failed to list s3 objects")?;
@@ -225,7 +274,7 @@ mod tests {
assert_eq!( assert_eq!(
local_path, local_path,
key.download_destination(&repo_harness.conf.workdir), key.download_destination(&repo_harness.conf.workdir, None),
"Download destination should consist of s3 path joined with the pageserver workdir prefix" "Download destination should consist of s3 path joined with the pageserver workdir prefix"
); );
@@ -239,14 +288,18 @@ mod tests {
let segment_1 = "matching"; let segment_1 = "matching";
let segment_2 = "file"; let segment_2 = "file";
let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2); let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2);
let storage = dummy_storage(&repo_harness.conf.workdir);
let expected_key = S3ObjectKey(format!( let expected_key = S3ObjectKey(format!(
"{SEPARATOR}{}{SEPARATOR}{}", "{}{SEPARATOR}{}{SEPARATOR}{}",
storage.prefix_in_bucket.as_deref().unwrap_or_default(),
segment_1, segment_1,
segment_2, segment_2,
SEPARATOR = S3_FILE_SEPARATOR, SEPARATOR = S3_FILE_SEPARATOR,
)); ));
let actual_key = dummy_storage(&repo_harness.conf.workdir) let actual_key = storage
.storage_path(local_path) .storage_path(local_path)
.expect("Matching path should map to S3 path normally"); .expect("Matching path should map to S3 path normally");
assert_eq!( assert_eq!(
@@ -308,18 +361,30 @@ mod tests {
let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID); let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID);
let relative_timeline_path = timeline_dir.strip_prefix(&repo_harness.conf.workdir)?; let relative_timeline_path = timeline_dir.strip_prefix(&repo_harness.conf.workdir)?;
let s3_key = create_s3_key(&relative_timeline_path.join("not a metadata")); let s3_key = create_s3_key(
&relative_timeline_path.join("not a metadata"),
storage.prefix_in_bucket.as_deref(),
);
assert_eq!( assert_eq!(
s3_key.download_destination(&repo_harness.conf.workdir), s3_key.download_destination(
&repo_harness.conf.workdir,
storage.prefix_in_bucket.as_deref()
),
storage storage
.local_path(&s3_key) .local_path(&s3_key)
.expect("For a valid input, valid S3 info should be parsed"), .expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote delta file" "Should be able to parse metadata out of the correctly named remote delta file"
); );
let s3_key = create_s3_key(&relative_timeline_path.join(METADATA_FILE_NAME)); let s3_key = create_s3_key(
&relative_timeline_path.join(METADATA_FILE_NAME),
storage.prefix_in_bucket.as_deref(),
);
assert_eq!( assert_eq!(
s3_key.download_destination(&repo_harness.conf.workdir), s3_key.download_destination(
&repo_harness.conf.workdir,
storage.prefix_in_bucket.as_deref()
),
storage storage
.local_path(&s3_key) .local_path(&s3_key)
.expect("For a valid input, valid S3 info should be parsed"), .expect("For a valid input, valid S3 info should be parsed"),
@@ -356,18 +421,18 @@ mod tests {
Credentials::anonymous().unwrap(), Credentials::anonymous().unwrap(),
) )
.unwrap(), .unwrap(),
prefix_in_bucket: Some("dummy_prefix/".to_string()),
} }
} }
fn create_s3_key(relative_file_path: &Path) -> S3ObjectKey { fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> S3ObjectKey {
S3ObjectKey( S3ObjectKey(relative_file_path.iter().fold(
relative_file_path prefix.unwrap_or_default().to_string(),
.iter() |mut path_string, segment| {
.fold(String::new(), |mut path_string, segment| { path_string.push(S3_FILE_SEPARATOR);
path_string.push(S3_FILE_SEPARATOR); path_string.push_str(segment.to_str().unwrap());
path_string.push_str(segment.to_str().unwrap()); path_string
path_string },
}), ))
)
} }
} }


@@ -80,16 +80,19 @@ use std::{
num::{NonZeroU32, NonZeroUsize}, num::{NonZeroU32, NonZeroUsize},
path::{Path, PathBuf}, path::{Path, PathBuf},
sync::Arc, sync::Arc,
thread,
}; };
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use futures::stream::{FuturesUnordered, StreamExt}; use futures::stream::{FuturesUnordered, StreamExt};
use lazy_static::lazy_static; use lazy_static::lazy_static;
use tokio::{fs, sync::RwLock};
use tokio::{ use tokio::{
sync::mpsc::{self, UnboundedReceiver}, fs,
time::Instant, runtime::Runtime,
sync::{
mpsc::{self, UnboundedReceiver},
RwLock,
},
time::{Duration, Instant},
}; };
use tracing::*; use tracing::*;
@@ -106,7 +109,7 @@ use super::{RemoteStorage, SyncStartupData, TimelineSyncId};
use crate::{ use crate::{
config::PageServerConf, layered_repository::metadata::TimelineMetadata, config::PageServerConf, layered_repository::metadata::TimelineMetadata,
remote_storage::storage_sync::compression::read_archive_header, repository::TimelineSyncState, remote_storage::storage_sync::compression::read_archive_header, repository::TimelineSyncState,
tenant_mgr::set_timeline_states, tenant_mgr::set_timeline_states, thread_mgr, thread_mgr::ThreadKind,
}; };
use zenith_metrics::{register_histogram_vec, register_int_gauge, HistogramVec, IntGauge}; use zenith_metrics::{register_histogram_vec, register_int_gauge, HistogramVec, IntGauge};
@@ -330,6 +333,10 @@ pub fn schedule_timeline_checkpoint_upload(
/// ///
/// Ensure that the loop is started otherwise the task is never processed. /// Ensure that the loop is started otherwise the task is never processed.
pub fn schedule_timeline_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { pub fn schedule_timeline_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) {
debug!(
"Scheduling timeline download for tenant {}, timeline {}",
tenant_id, timeline_id
);
sync_queue::push(SyncTask::new( sync_queue::push(SyncTask::new(
TimelineSyncId(tenant_id, timeline_id), TimelineSyncId(tenant_id, timeline_id),
0, 0,
@@ -369,7 +376,7 @@ pub(super) fn spawn_storage_sync_thread<
Ok(local_path) => Some(local_path), Ok(local_path) => Some(local_path),
Err(e) => { Err(e) => {
error!( error!(
"Failed to find local path for remote path {:?}: {:#}", "Failed to find local path for remote path {:?}: {:?}",
remote_path, e remote_path, e
); );
None None
@@ -379,9 +386,12 @@ pub(super) fn spawn_storage_sync_thread<
let initial_timeline_states = schedule_first_sync_tasks(&remote_index, local_timeline_files); let initial_timeline_states = schedule_first_sync_tasks(&remote_index, local_timeline_files);
let handle = thread::Builder::new() thread_mgr::spawn(
.name("Remote storage sync thread".to_string()) ThreadKind::StorageSync,
.spawn(move || { None,
None,
"Remote storage sync thread",
move || {
storage_sync_loop( storage_sync_loop(
runtime, runtime,
conf, conf,
@@ -391,19 +401,25 @@ pub(super) fn spawn_storage_sync_thread<
max_concurrent_sync, max_concurrent_sync,
max_sync_errors, max_sync_errors,
) )
}) },
.context("Failed to spawn remote storage sync thread")?; )
.context("Failed to spawn remote storage sync thread")?;
Ok(SyncStartupData { Ok(SyncStartupData {
initial_timeline_states, initial_timeline_states,
sync_loop_handle: Some(handle),
}) })
} }
enum LoopStep {
NewStates(HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>>),
Shutdown,
}
#[allow(clippy::too_many_arguments)]
fn storage_sync_loop< fn storage_sync_loop<
P: std::fmt::Debug + Send + Sync + 'static, P: std::fmt::Debug + Send + Sync + 'static,
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static, S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
>( >(
runtime: tokio::runtime::Runtime, runtime: Runtime,
conf: &'static PageServerConf, conf: &'static PageServerConf,
mut receiver: UnboundedReceiver<SyncTask>, mut receiver: UnboundedReceiver<SyncTask>,
index: RemoteTimelineIndex, index: RemoteTimelineIndex,
@@ -412,23 +428,34 @@ fn storage_sync_loop<
max_sync_errors: NonZeroU32, max_sync_errors: NonZeroU32,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let remote_assets = Arc::new((storage, RwLock::new(index))); let remote_assets = Arc::new((storage, RwLock::new(index)));
while !crate::tenant_mgr::shutdown_requested() { loop {
let new_timeline_states = runtime.block_on( let loop_step = runtime.block_on(async {
loop_step( tokio::select! {
conf, new_timeline_states = loop_step(
&mut receiver, conf,
Arc::clone(&remote_assets), &mut receiver,
max_concurrent_sync, Arc::clone(&remote_assets),
max_sync_errors, max_concurrent_sync,
) max_sync_errors,
.instrument(debug_span!("storage_sync_loop_step")), )
); .instrument(debug_span!("storage_sync_loop_step")) => LoopStep::NewStates(new_timeline_states),
// Batch timeline download registration to ensure that the external registration code won't block any running tasks before. _ = thread_mgr::shutdown_watcher() => LoopStep::Shutdown,
set_timeline_states(conf, new_timeline_states); }
debug!("Sync loop step completed"); });
match loop_step {
LoopStep::NewStates(new_timeline_states) => {
// Batch timeline download registration to ensure that the external registration code won't block any running tasks before.
set_timeline_states(conf, new_timeline_states);
debug!("Sync loop step completed");
}
LoopStep::Shutdown => {
debug!("Shutdown requested, stopping");
break;
}
}
} }
debug!("Shutdown requested, stopping");
Ok(()) Ok(())
} }
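The reworked sync loop runs on a plain thread but uses block_on plus tokio::select! to wait for either the next batch of work or a shutdown signal, folding the outcome into the LoopStep enum. A minimal sketch of that shape, with tokio mpsc/watch channels standing in for the sync queue and the thread_mgr watcher:

use tokio::runtime::Runtime;
use tokio::sync::{mpsc, watch};

enum LoopStep {
    Task(u32),
    Shutdown,
}

fn sync_loop(runtime: Runtime, mut receiver: mpsc::UnboundedReceiver<u32>, mut shutdown: watch::Receiver<bool>) {
    loop {
        // Drive the async wait from a synchronous loop.
        let step = runtime.block_on(async {
            tokio::select! {
                task = receiver.recv() => {
                    match task {
                        Some(task) => LoopStep::Task(task),
                        None => LoopStep::Shutdown, // queue closed
                    }
                }
                _ = shutdown.changed() => LoopStep::Shutdown,
            }
        });
        match step {
            LoopStep::Task(task) => println!("processing sync task {task}"),
            LoopStep::Shutdown => {
                println!("shutdown requested, stopping");
                break;
            }
        }
    }
}

fn main() {
    let runtime = Runtime::new().unwrap();
    let (tx, rx) = mpsc::unbounded_channel();
    let (_shutdown_tx, shutdown_rx) = watch::channel(false);
    tx.send(1).unwrap();
    drop(tx); // closing the queue ends the loop in this sketch
    sync_loop(runtime, rx, shutdown_rx);
}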
@@ -482,7 +509,7 @@ async fn loop_step<
Ok(extra_step) => extra_step, Ok(extra_step) => extra_step,
Err(e) => { Err(e) => {
error!( error!(
"Failed to process storage sync task for tenant {}, timeline {}: {:#}", "Failed to process storage sync task for tenant {}, timeline {}: {:?}",
sync_id.0, sync_id.1, e sync_id.0, sync_id.1, e
); );
None None
@@ -539,7 +566,7 @@ async fn process_task<
"Waiting {} seconds before starting the task", "Waiting {} seconds before starting the task",
seconds_to_wait seconds_to_wait
); );
tokio::time::sleep(tokio::time::Duration::from_secs_f64(seconds_to_wait)).await; tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await;
} }
let sync_start = Instant::now(); let sync_start = Instant::now();


@@ -34,7 +34,7 @@ use std::{
sync::Arc, sync::Arc,
}; };
use anyhow::{anyhow, bail, ensure, Context}; use anyhow::{bail, ensure, Context};
use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder}; use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tokio::{ use tokio::{
@@ -211,16 +211,18 @@ pub async fn read_archive_header<A: io::AsyncRead + Send + Sync + Unpin>(
pub fn parse_archive_name(archive_path: &Path) -> anyhow::Result<(Lsn, u64)> { pub fn parse_archive_name(archive_path: &Path) -> anyhow::Result<(Lsn, u64)> {
let archive_name = archive_path let archive_name = archive_path
.file_name() .file_name()
.ok_or_else(|| anyhow!("Archive '{}' has no file name", archive_path.display()))? .with_context(|| format!("Archive '{}' has no file name", archive_path.display()))?
.to_string_lossy(); .to_string_lossy();
let (lsn_str, header_size_str) = let (lsn_str, header_size_str) =
archive_name.rsplit_once(ARCHIVE_EXTENSION).ok_or_else(|| { archive_name
anyhow!( .rsplit_once(ARCHIVE_EXTENSION)
"Archive '{}' has incorrect extension, expected to contain '{}'", .with_context(|| {
archive_path.display(), format!(
ARCHIVE_EXTENSION "Archive '{}' has incorrect extension, expected to contain '{}'",
) archive_path.display(),
})?; ARCHIVE_EXTENSION
)
})?;
let disk_consistent_lsn = Lsn::from_hex(lsn_str).with_context(|| { let disk_consistent_lsn = Lsn::from_hex(lsn_str).with_context(|| {
format!( format!(
"Archive '{}' has an invalid disk consistent lsn in its extension", "Archive '{}' has an invalid disk consistent lsn in its extension",
@@ -374,7 +376,7 @@ async fn write_archive_contents(
} }
let metadata_bytes_written = io::copy(&mut metadata_bytes.as_slice(), &mut archive_input) let metadata_bytes_written = io::copy(&mut metadata_bytes.as_slice(), &mut archive_input)
.await .await
.with_context(|| "Failed to add metadata into the archive")?; .context("Failed to add metadata into the archive")?;
ensure!( ensure!(
header.metadata_file_size == metadata_bytes_written, header.metadata_file_size == metadata_bytes_written,
"Metadata file was written to the archive incompletely", "Metadata file was written to the archive incompletely",


@@ -3,7 +3,7 @@
use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc}; use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
use anyhow::{anyhow, ensure, Context}; use anyhow::{ensure, Context};
use futures::{stream::FuturesUnordered, StreamExt}; use futures::{stream::FuturesUnordered, StreamExt};
use tokio::{fs, sync::RwLock}; use tokio::{fs, sync::RwLock};
use tracing::{debug, error, trace, warn}; use tracing::{debug, error, trace, warn};
@@ -80,7 +80,7 @@ pub(super) async fn download_timeline<
{ {
Ok(remote_timeline) => Cow::Owned(remote_timeline), Ok(remote_timeline) => Cow::Owned(remote_timeline),
Err(e) => { Err(e) => {
error!("Failed to download full timeline index: {:#}", e); error!("Failed to download full timeline index: {:?}", e);
return match remote_disk_consistent_lsn { return match remote_disk_consistent_lsn {
Some(disk_consistent_lsn) => { Some(disk_consistent_lsn) => {
sync_queue::push(SyncTask::new( sync_queue::push(SyncTask::new(
@@ -112,7 +112,7 @@ pub(super) async fn download_timeline<
if let Err(e) = download_missing_branches(conf, remote_assets.as_ref(), sync_id.0).await { if let Err(e) = download_missing_branches(conf, remote_assets.as_ref(), sync_id.0).await {
error!( error!(
"Failed to download missing branches for sync id {}: {:#}", "Failed to download missing branches for sync id {}: {:?}",
sync_id, e sync_id, e
); );
sync_queue::push(SyncTask::new( sync_queue::push(SyncTask::new(
@@ -150,7 +150,7 @@ pub(super) async fn download_timeline<
Err(e) => { Err(e) => {
let archives_left = archives_total - archives_downloaded; let archives_left = archives_total - archives_downloaded;
error!( error!(
"Failed to download archive {:?} (archives downloaded: {}; archives left: {}) for tenant {} timeline {}, requeueing the download: {:#}", "Failed to download archive {:?} (archives downloaded: {}; archives left: {}) for tenant {} timeline {}, requeueing the download: {:?}",
archive_id, archives_downloaded, archives_left, tenant_id, timeline_id, e archive_id, archives_downloaded, archives_left, tenant_id, timeline_id, e
); );
sync_queue::push(SyncTask::new( sync_queue::push(SyncTask::new(
@@ -189,7 +189,7 @@ async fn try_download_archive<
debug!("Downloading archive {:?}", archive_id); debug!("Downloading archive {:?}", archive_id);
let archive_to_download = remote_timeline let archive_to_download = remote_timeline
.archive_data(archive_id) .archive_data(archive_id)
.ok_or_else(|| anyhow!("Archive {:?} not found in remote storage", archive_id))?; .with_context(|| format!("Archive {:?} not found in remote storage", archive_id))?;
let (archive_header, header_size) = remote_timeline let (archive_header, header_size) = remote_timeline
.restore_header(archive_id) .restore_header(archive_id)
.context("Failed to restore header when downloading an archive")?; .context("Failed to restore header when downloading an archive")?;
@@ -202,7 +202,7 @@ async fn try_download_archive<
archive_to_download.disk_consistent_lsn(), archive_to_download.disk_consistent_lsn(),
local_metadata.disk_consistent_lsn() local_metadata.disk_consistent_lsn()
), ),
Err(e) => warn!("Failed to read local metadata file, assuing it's safe to override its with the download. Read: {:#}", e), Err(e) => warn!("Failed to read local metadata file, assuming it's safe to override its with the download. Read: {:#}", e),
} }
compression::uncompress_file_stream_with_index( compression::uncompress_file_stream_with_index(
conf.timeline_path(&timeline_id, &tenant_id), conf.timeline_path(&timeline_id, &tenant_id),
@@ -307,7 +307,7 @@ async fn download_missing_branches<
while let Some(download_result) = remote_only_branches_downloads.next().await { while let Some(download_result) = remote_only_branches_downloads.next().await {
if let Err(e) = download_result { if let Err(e) = download_result {
branch_downloads_failed = true; branch_downloads_failed = true;
error!("Failed to download a branch file: {:#}", e); error!("Failed to download a branch file: {:?}", e);
} }
} }
ensure!( ensure!(


@@ -9,7 +9,7 @@ use std::{
path::{Path, PathBuf}, path::{Path, PathBuf},
}; };
use anyhow::{anyhow, bail, ensure, Context}; use anyhow::{bail, ensure, Context};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tracing::debug; use tracing::debug;
use zenith_utils::{ use zenith_utils::{
@@ -214,7 +214,7 @@ impl RemoteTimeline {
let archive = self let archive = self
.checkpoint_archives .checkpoint_archives
.get(&archive_id) .get(&archive_id)
.ok_or_else(|| anyhow!("Archive {:?} not found", archive_id))?; .with_context(|| format!("Archive {:?} not found", archive_id))?;
let mut header_files = Vec::with_capacity(archive.files.len()); let mut header_files = Vec::with_capacity(archive.files.len());
for (expected_archive_position, archive_file) in archive.files.iter().enumerate() { for (expected_archive_position, archive_file) in archive.files.iter().enumerate() {
@@ -226,11 +226,10 @@ impl RemoteTimeline {
archive_id, archive_id,
); );
let timeline_file = self.timeline_files.get(archive_file).ok_or_else(|| { let timeline_file = self.timeline_files.get(archive_file).with_context(|| {
anyhow!( format!(
"File with id {:?} not found for archive {:?}", "File with id {:?} not found for archive {:?}",
archive_file, archive_file, archive_id
archive_id
) )
})?; })?;
header_files.push(timeline_file.clone()); header_files.push(timeline_file.clone());
@@ -299,7 +298,7 @@ fn try_parse_index_entry(
})? })?
.iter() .iter()
.next() .next()
.ok_or_else(|| anyhow!("Found no tenant id in path '{}'", path.display()))? .with_context(|| format!("Found no tenant id in path '{}'", path.display()))?
.to_string_lossy() .to_string_lossy()
.parse::<ZTenantId>() .parse::<ZTenantId>()
.with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?; .with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?;
@@ -321,8 +320,8 @@ fn try_parse_index_entry(
let mut segments = timelines_subpath.iter(); let mut segments = timelines_subpath.iter();
let timeline_id = segments let timeline_id = segments
.next() .next()
.ok_or_else(|| { .with_context(|| {
anyhow!( format!(
"{} directory of tenant {} (path '{}') is not an index entry", "{} directory of tenant {} (path '{}') is not an index entry",
TIMELINES_SEGMENT_NAME, TIMELINES_SEGMENT_NAME,
tenant_id, tenant_id,
@@ -345,7 +344,7 @@ fn try_parse_index_entry(
let archive_name = path let archive_name = path
.file_name() .file_name()
.ok_or_else(|| anyhow!("Archive '{}' has no file name", path.display()))? .with_context(|| format!("Archive '{}' has no file name", path.display()))?
.to_string_lossy() .to_string_lossy()
.to_string(); .to_string();
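Editor's note: the changes above replace .ok_or_else(|| anyhow!(...)) with .with_context(|| format!(...)) and switch error logging from {:#} to {:?}. A minimal, self-contained sketch (not part of the patch) of why both idioms work: anyhow's Context trait is implemented for Option as well as Result, so with_context can replace ok_or_else(anyhow!(...)) directly, and {:?} prints the full cause chain on separate lines while {:#} prints it as a single colon-separated line.

use anyhow::{Context, Result};
use std::collections::HashMap;

fn lookup(map: &HashMap<u32, String>, key: u32) -> Result<String> {
    map.get(&key)
        .cloned()
        // On an Option, the closure's message becomes the error itself.
        .with_context(|| format!("key {} not found", key))
}

fn main() {
    let map = HashMap::new();
    if let Err(e) = lookup(&map, 7).context("failed to resolve key") {
        println!("{:#}", e); // failed to resolve key: key 7 not found
        println!("{:?}", e); // multi-line form with a "Caused by:" section
    }
}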


@@ -43,7 +43,7 @@ pub(super) async fn upload_timeline_checkpoint<
debug!("Uploading checkpoint for sync id {}", sync_id); debug!("Uploading checkpoint for sync id {}", sync_id);
if let Err(e) = upload_missing_branches(config, remote_assets.as_ref(), sync_id.0).await { if let Err(e) = upload_missing_branches(config, remote_assets.as_ref(), sync_id.0).await {
error!( error!(
"Failed to upload missing branches for sync id {}: {:#}", "Failed to upload missing branches for sync id {}: {:?}",
sync_id, e sync_id, e
); );
sync_queue::push(SyncTask::new( sync_queue::push(SyncTask::new(
@@ -69,7 +69,7 @@ pub(super) async fn upload_timeline_checkpoint<
match update_index_description(remote_assets.as_ref(), &timeline_dir, sync_id).await { match update_index_description(remote_assets.as_ref(), &timeline_dir, sync_id).await {
Ok(remote_timeline) => Some(Cow::Owned(remote_timeline)), Ok(remote_timeline) => Some(Cow::Owned(remote_timeline)),
Err(e) => { Err(e) => {
error!("Failed to download full timeline index: {:#}", e); error!("Failed to download full timeline index: {:?}", e);
sync_queue::push(SyncTask::new( sync_queue::push(SyncTask::new(
sync_id, sync_id,
retries, retries,
@@ -132,7 +132,7 @@ pub(super) async fn upload_timeline_checkpoint<
} }
Err(e) => { Err(e) => {
error!( error!(
"Failed to upload checkpoint: {:#}, requeueing the upload", "Failed to upload checkpoint: {:?}, requeueing the upload",
e e
); );
sync_queue::push(SyncTask::new( sync_queue::push(SyncTask::new(
@@ -253,7 +253,7 @@ async fn upload_missing_branches<
.await .await
.add_branch_file(tenant_id, local_only_branch.clone()), .add_branch_file(tenant_id, local_only_branch.clone()),
Err(e) => { Err(e) => {
error!("Failed to upload branch file: {:#}", e); error!("Failed to upload branch file: {:?}", e);
branch_uploads_failed = true; branch_uploads_failed = true;
} }
} }


@@ -7,7 +7,7 @@ use postgres_ffi::{MultiXactId, MultiXactOffset, TransactionId};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::collections::HashSet; use std::collections::HashSet;
use std::ops::{AddAssign, Deref}; use std::ops::{AddAssign, Deref};
use std::sync::Arc; use std::sync::{Arc, RwLockReadGuard};
use std::time::Duration; use std::time::Duration;
use zenith_utils::lsn::{Lsn, RecordLsn}; use zenith_utils::lsn::{Lsn, RecordLsn};
use zenith_utils::zid::ZTimelineId; use zenith_utils::zid::ZTimelineId;
@@ -19,7 +19,7 @@ pub type BlockNumber = u32;
/// A repository corresponds to one .zenith directory. One repository holds multiple /// A repository corresponds to one .zenith directory. One repository holds multiple
/// timelines, forked off from the same initial call to 'initdb'. /// timelines, forked off from the same initial call to 'initdb'.
pub trait Repository: Send + Sync { pub trait Repository: Send + Sync {
fn shutdown(&self) -> Result<()>; fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
/// Updates timeline based on the new sync state, received from the remote storage synchronization. /// Updates timeline based on the new sync state, received from the remote storage synchronization.
/// See [`crate::remote_storage`] for more details about the synchronization. /// See [`crate::remote_storage`] for more details about the synchronization.
@@ -184,6 +184,9 @@ pub trait Timeline: Send + Sync {
/// ///
fn wait_lsn(&self, lsn: Lsn) -> Result<()>; fn wait_lsn(&self, lsn: Lsn) -> Result<()>;
/// Lock and get timeline's GC cuttof
fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard<Lsn>;
/// Look up given page version. /// Look up given page version.
fn get_page_at_lsn(&self, tag: RelishTag, blknum: BlockNumber, lsn: Lsn) -> Result<Bytes>; fn get_page_at_lsn(&self, tag: RelishTag, blknum: BlockNumber, lsn: Lsn) -> Result<Bytes>;
@@ -217,10 +220,12 @@ pub trait Timeline: Send + Sync {
/// Atomically get both last and prev. /// Atomically get both last and prev.
fn get_last_record_rlsn(&self) -> RecordLsn; fn get_last_record_rlsn(&self) -> RecordLsn;
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
fn get_last_record_lsn(&self) -> Lsn; fn get_last_record_lsn(&self) -> Lsn;
fn get_prev_record_lsn(&self) -> Lsn; fn get_prev_record_lsn(&self) -> Lsn;
fn get_start_lsn(&self) -> Lsn;
fn get_disk_consistent_lsn(&self) -> Lsn; fn get_disk_consistent_lsn(&self) -> Lsn;
/// Mutate the timeline with a [`TimelineWriter`]. /// Mutate the timeline with a [`TimelineWriter`].
@@ -235,7 +240,11 @@ pub trait Timeline: Send + Sync {
/// ///
/// Check that it is valid to request operations with that lsn. /// Check that it is valid to request operations with that lsn.
fn check_lsn_is_in_scope(&self, lsn: Lsn) -> Result<()>; fn check_lsn_is_in_scope(
&self,
lsn: Lsn,
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
) -> Result<()>;
/// Retrieve current logical size of the timeline /// Retrieve current logical size of the timeline
/// ///
@@ -244,7 +253,7 @@ pub trait Timeline: Send + Sync {
fn get_current_logical_size(&self) -> usize; fn get_current_logical_size(&self) -> usize;
/// Does the same as get_current_logical_size but counted on demand. /// Does the same as get_current_logical_size but counted on demand.
/// Used in tests to ensure thet incremental and non incremental variants match. /// Used in tests to ensure that incremental and non incremental variants match.
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>; fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
/// An escape hatch to allow "casting" a generic Timeline to LayeredTimeline. /// An escape hatch to allow "casting" a generic Timeline to LayeredTimeline.
@@ -987,7 +996,7 @@ mod tests {
.source() .source()
.unwrap() .unwrap()
.to_string() .to_string()
.contains("is earlier than initdb lsn")); .contains("is earlier than latest GC horizon"));
} }
} }
@@ -1004,12 +1013,11 @@ mod tests {
make_some_layers(&tline, Lsn(0x20))?; make_some_layers(&tline, Lsn(0x20))?;
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
match tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)) { match tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)) {
Ok(_) => panic!("request for page should have failed"), Ok(_) => panic!("request for page should have failed"),
Err(err) => assert!(err Err(err) => assert!(err.to_string().contains("not found at")),
.to_string()
.contains("tried to request a page version that was garbage collected")),
} }
Ok(()) Ok(())
} }


@@ -5,15 +5,16 @@ use crate::branches;
use crate::config::PageServerConf; use crate::config::PageServerConf;
use crate::layered_repository::LayeredRepository; use crate::layered_repository::LayeredRepository;
use crate::repository::{Repository, Timeline, TimelineSyncState}; use crate::repository::{Repository, Timeline, TimelineSyncState};
use crate::tenant_threads; use crate::thread_mgr;
use crate::thread_mgr::ThreadKind;
use crate::walredo::PostgresRedoManager; use crate::walredo::PostgresRedoManager;
use anyhow::{anyhow, bail, Context, Result}; use crate::CheckpointConfig;
use anyhow::{bail, Context, Result};
use lazy_static::lazy_static; use lazy_static::lazy_static;
use log::*; use log::*;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::collections::{hash_map, HashMap}; use std::collections::{hash_map, HashMap};
use std::fmt; use std::fmt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex, MutexGuard}; use std::sync::{Arc, Mutex, MutexGuard};
use zenith_utils::zid::{ZTenantId, ZTimelineId}; use zenith_utils::zid::{ZTenantId, ZTimelineId};
@@ -23,7 +24,7 @@ lazy_static! {
struct Tenant { struct Tenant {
state: TenantState, state: TenantState,
repo: Option<Arc<dyn Repository>>, repo: Arc<dyn Repository>,
} }
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -56,8 +57,6 @@ fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
TENANTS.lock().unwrap() TENANTS.lock().unwrap()
} }
static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
/// Updates tenants' repositories, changing their timelines state in memory. /// Updates tenants' repositories, changing their timelines state in memory.
pub fn set_timeline_states( pub fn set_timeline_states(
conf: &'static PageServerConf, conf: &'static PageServerConf,
@@ -73,28 +72,8 @@ pub fn set_timeline_states(
let mut m = access_tenants(); let mut m = access_tenants();
for (tenant_id, timeline_states) in timeline_states { for (tenant_id, timeline_states) in timeline_states {
let tenant = m.entry(tenant_id).or_insert_with(|| Tenant { let tenant = m.entry(tenant_id).or_insert_with(|| {
state: TenantState::Idle, // TODO (rodionov) reuse one of the initialisation routines
repo: None,
});
if let Err(e) = put_timelines_into_tenant(conf, tenant, tenant_id, timeline_states) {
error!(
"Failed to update timeline states for tenant {}: {:#}",
tenant_id, e
);
}
}
}
fn put_timelines_into_tenant(
conf: &'static PageServerConf,
tenant: &mut Tenant,
tenant_id: ZTenantId,
timeline_states: HashMap<ZTimelineId, TimelineSyncState>,
) -> anyhow::Result<()> {
let repo = match tenant.repo.as_ref() {
Some(repo) => Arc::clone(repo),
None => {
// Set up a WAL redo manager, for applying WAL records. // Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
@@ -105,13 +84,43 @@ fn put_timelines_into_tenant(
tenant_id, tenant_id,
conf.remote_storage_config.is_some(), conf.remote_storage_config.is_some(),
)); ));
tenant.repo = Some(Arc::clone(&repo)); Tenant {
repo state: TenantState::Idle,
repo,
}
});
if let Err(e) = put_timelines_into_tenant(tenant, tenant_id, timeline_states) {
error!(
"Failed to update timeline states for tenant {}: {:?}",
tenant_id, e
);
} }
}; }
}
fn put_timelines_into_tenant(
tenant: &mut Tenant,
tenant_id: ZTenantId,
timeline_states: HashMap<ZTimelineId, TimelineSyncState>,
) -> anyhow::Result<()> {
for (timeline_id, timeline_state) in timeline_states { for (timeline_id, timeline_state) in timeline_states {
repo.set_timeline_state(timeline_id, timeline_state) // If the timeline is being put into any other state than Ready,
// stop any threads operating on it.
//
// FIXME: This is racy. A page service thread could just get a
// handle on the Timeline, before we call set_timeline_state()
if !matches!(timeline_state, TimelineSyncState::Ready(_)) {
thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id));
// Should we run a final checkpoint to flush all the data to
// disk? Doesn't seem necessary; all of the states other than
// Ready imply that the data on local disk is corrupt or incomplete,
// and we don't want to flush that to disk.
}
tenant
.repo
.set_timeline_state(timeline_id, timeline_state)
.with_context(|| { .with_context(|| {
format!( format!(
"Failed to update timeline {} state to {:?}", "Failed to update timeline {} state to {:?}",
@@ -123,29 +132,49 @@ fn put_timelines_into_tenant(
Ok(()) Ok(())
} }
// Check this flag in the thread loops to know when to exit ///
pub fn shutdown_requested() -> bool { /// Shut down all tenants. This runs as part of pageserver shutdown.
SHUTDOWN_REQUESTED.load(Ordering::Relaxed) ///
} pub fn shutdown_all_tenants() {
let mut m = access_tenants();
pub fn shutdown_all_tenants() -> Result<()> { let mut tenantids = Vec::new();
SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed); for (tenantid, tenant) in m.iter_mut() {
tenant.state = TenantState::Stopping;
let tenantids = list_tenantids()?; tenantids.push(*tenantid)
for tenantid in &tenantids {
set_tenant_state(*tenantid, TenantState::Stopping)?;
} }
drop(m);
thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiver), None, None);
thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None);
thread_mgr::shutdown_threads(Some(ThreadKind::Checkpointer), None, None);
// Ok, no background threads running anymore. Flush any remaining data in
// memory to disk.
//
// We assume that any incoming connections that might request pages from
// the repository have already been terminated by the caller, so there
// should be no more activity in any of the repositories.
//
// On error, log it but continue with the shutdown for other tenants.
for tenantid in tenantids { for tenantid in tenantids {
// Wait for checkpointer and GC to finish their job
tenant_threads::wait_for_tenant_threads_to_stop(tenantid);
let repo = get_repository_for_tenant(tenantid)?;
debug!("shutdown tenant {}", tenantid); debug!("shutdown tenant {}", tenantid);
repo.shutdown()?; match get_repository_for_tenant(tenantid) {
Ok(repo) => {
if let Err(err) = repo.checkpoint_iteration(CheckpointConfig::Flush) {
error!(
"Could not checkpoint tenant {} during shutdown: {:?}",
tenantid, err
);
}
}
Err(err) => {
error!(
"Could not get repository for tenant {} during shutdown: {:?}",
tenantid, err
);
}
}
} }
Ok(())
} }
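One plausible caller-side ordering implied by the comments above (an editor's sketch, not the actual pageserver shutdown path): stop the listener and per-connection threads first, so no new page requests arrive, then let shutdown_all_tenants() stop the background threads and flush.

fn graceful_shutdown() {
    // Stop accepting new connections and finish the existing handlers.
    thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None);
    thread_mgr::shutdown_threads(Some(ThreadKind::HttpEndpointListener), None, None);
    thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None);
    // Now it is safe to stop per-tenant threads and flush remaining data.
    tenant_mgr::shutdown_all_tenants();
}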
pub fn create_repository_for_tenant( pub fn create_repository_for_tenant(
@@ -153,7 +182,7 @@ pub fn create_repository_for_tenant(
tenantid: ZTenantId, tenantid: ZTenantId,
) -> Result<()> { ) -> Result<()> {
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid)); let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
let repo = Some(branches::create_repo(conf, tenantid, wal_redo_manager)?); let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
match access_tenants().entry(tenantid) { match access_tenants().entry(tenantid) {
hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenantid), hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenantid),
@@ -172,34 +201,60 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option<TenantState> {
Some(access_tenants().get(&tenantid)?.state) Some(access_tenants().get(&tenantid)?.state)
} }
pub fn set_tenant_state(tenantid: ZTenantId, newstate: TenantState) -> Result<TenantState> { ///
/// Change the state of a tenant to Active and launch its checkpointer and GC
/// threads. If the tenant was already in Active state or Stopping, does nothing.
///
pub fn activate_tenant(conf: &'static PageServerConf, tenantid: ZTenantId) -> Result<()> {
let mut m = access_tenants(); let mut m = access_tenants();
let tenant = m.get_mut(&tenantid); let tenant = m
.get_mut(&tenantid)
.with_context(|| format!("Tenant not found for id {}", tenantid))?;
match tenant { info!("activating tenant {}", tenantid);
Some(tenant) => {
if newstate == TenantState::Idle && tenant.state != TenantState::Active { match tenant.state {
// Only Active tenant can become Idle // If the tenant is already active, nothing to do.
return Ok(tenant.state); TenantState::Active => {}
}
info!("set_tenant_state: {} -> {}", tenant.state, newstate); // If it's Idle, launch the checkpointer and GC threads
tenant.state = newstate; TenantState::Idle => {
Ok(tenant.state) thread_mgr::spawn(
ThreadKind::Checkpointer,
Some(tenantid),
None,
"Checkpointer thread",
move || crate::tenant_threads::checkpoint_loop(tenantid, conf),
)?;
// FIXME: if we fail to launch the GC thread, but already launched the
// checkpointer, we're in a strange state.
thread_mgr::spawn(
ThreadKind::GarbageCollector,
Some(tenantid),
None,
"GC thread",
move || crate::tenant_threads::gc_loop(tenantid, conf),
)?;
tenant.state = TenantState::Active;
}
TenantState::Stopping => {
// don't re-activate it if it's being stopped
} }
None => bail!("Tenant not found for id {}", tenantid),
} }
Ok(())
} }
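A possible way to address the FIXME above (an editor's suggestion, not part of the patch): treat the two spawns as a pair, and if the GC thread fails to start, shut the just-started checkpointer back down before returning the error. The fragment below would replace the second thread_mgr::spawn call inside activate_tenant.

if let Err(e) = thread_mgr::spawn(
    ThreadKind::GarbageCollector,
    Some(tenantid),
    None,
    "GC thread",
    move || crate::tenant_threads::gc_loop(tenantid, conf),
) {
    // Roll back the checkpointer so the tenant stays consistently Idle.
    thread_mgr::shutdown_threads(Some(ThreadKind::Checkpointer), Some(tenantid), None);
    return Err(e.into());
}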
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> { pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
let m = access_tenants(); let m = access_tenants();
let tenant = m let tenant = m
.get(&tenantid) .get(&tenantid)
.ok_or_else(|| anyhow!("Tenant not found for tenant {}", tenantid))?; .with_context(|| format!("Tenant not found for tenant {}", tenantid))?;
match &tenant.repo { Ok(Arc::clone(&tenant.repo))
Some(repo) => Ok(Arc::clone(repo)),
None => anyhow::bail!("Repository for tenant {} is not yet valid", tenantid),
}
} }
pub fn get_timeline_for_tenant( pub fn get_timeline_for_tenant(
@@ -209,17 +264,7 @@ pub fn get_timeline_for_tenant(
get_repository_for_tenant(tenantid)? get_repository_for_tenant(tenantid)?
.get_timeline(timelineid)? .get_timeline(timelineid)?
.local_timeline() .local_timeline()
.ok_or_else(|| anyhow!("cannot fetch timeline {}", timelineid)) .with_context(|| format!("cannot fetch timeline {}", timelineid))
}
fn list_tenantids() -> Result<Vec<ZTenantId>> {
access_tenants()
.iter()
.map(|v| {
let (tenantid, _) = v;
Ok(*tenantid)
})
.collect()
} }
#[derive(Serialize, Deserialize, Clone)] #[derive(Serialize, Deserialize, Clone)]


@@ -5,88 +5,14 @@ use crate::tenant_mgr;
use crate::tenant_mgr::TenantState; use crate::tenant_mgr::TenantState;
use crate::CheckpointConfig; use crate::CheckpointConfig;
use anyhow::Result; use anyhow::Result;
use lazy_static::lazy_static;
use std::collections::HashMap;
use std::sync::Mutex;
use std::thread::JoinHandle;
use std::time::Duration; use std::time::Duration;
use tracing::*; use tracing::*;
use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
use zenith_utils::zid::ZTenantId; use zenith_utils::zid::ZTenantId;
struct TenantHandleEntry {
checkpointer_handle: Option<JoinHandle<()>>,
gc_handle: Option<JoinHandle<()>>,
}
// Preserve handles to wait for thread completion
// at shutdown
lazy_static! {
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
Mutex::new(HashMap::new());
}
lazy_static! {
static ref TENANT_THREADS_COUNT: IntGaugeVec = register_int_gauge_vec!(
"tenant_threads_count",
"Number of live tenant threads",
&["tenant_thread_type"]
)
.expect("failed to define a metric");
}
// Launch checkpointer and GC for the tenant.
// It's possible that the threads are running already,
// if so, just don't spawn new ones.
pub fn start_tenant_threads(conf: &'static PageServerConf, tenantid: ZTenantId) {
let mut handles = TENANT_HANDLES.lock().unwrap();
let h = handles
.entry(tenantid)
.or_insert_with(|| TenantHandleEntry {
checkpointer_handle: None,
gc_handle: None,
});
if h.checkpointer_handle.is_none() {
h.checkpointer_handle = std::thread::Builder::new()
.name("Checkpointer thread".into())
.spawn(move || {
checkpoint_loop(tenantid, conf).expect("Checkpointer thread died");
})
.ok();
}
if h.gc_handle.is_none() {
h.gc_handle = std::thread::Builder::new()
.name("GC thread".into())
.spawn(move || {
gc_loop(tenantid, conf).expect("GC thread died");
})
.ok();
}
}
pub fn wait_for_tenant_threads_to_stop(tenantid: ZTenantId) {
let mut handles = TENANT_HANDLES.lock().unwrap();
if let Some(h) = handles.get_mut(&tenantid) {
h.checkpointer_handle.take().map(JoinHandle::join);
trace!("checkpointer for tenant {} has stopped", tenantid);
h.gc_handle.take().map(JoinHandle::join);
trace!("gc for tenant {} has stopped", tenantid);
}
handles.remove(&tenantid);
}
/// ///
/// Checkpointer thread's main loop /// Checkpointer thread's main loop
/// ///
fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> { pub fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
let gauge = TENANT_THREADS_COUNT.with_label_values(&["checkpointer"]);
gauge.inc();
scopeguard::defer! {
gauge.dec();
}
loop { loop {
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
break; break;
@@ -112,13 +38,7 @@ fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result
/// ///
/// GC thread's main loop /// GC thread's main loop
/// ///
fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> { pub fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
let gauge = TENANT_THREADS_COUNT.with_label_values(&["gc"]);
gauge.inc();
scopeguard::defer! {
gauge.dec();
}
loop { loop {
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
break; break;


@@ -0,0 +1,284 @@
//!
//! This module provides centralized handling of threads in the Page Server.
//!
//! We provide a few basic facilities:
//! - A global registry of threads that lists what kind of threads they are, and
//! which tenant or timeline they are working on
//!
//! - The ability to request a thread to shut down.
//!
//!
//! # How it works?
//!
//! There is a global hashmap of all the threads (`THREADS`). Whenever a new
//! thread is spawned, a PageServerThread entry is added there, and when a
//! thread dies, it removes itself from the hashmap. If you want to kill a
//! thread, you can scan the hashmap to find it.
//!
//! # Thread shutdown
//!
//! To kill a thread, we rely on co-operation from the victim. Each thread is
//! expected to periodically call the `is_shutdown_requested()` function, and
//! if it returns true, exit gracefully. In addition to that, when waiting for
//! the network or other long-running operation, you can use
//! `shutdown_watcher()` function to get a Future that will become ready if
//! the current thread has been requested to shut down. You can use that with
//! Tokio select!(), but note that it relies on thread-local storage, so it
//! will only work with the "current-thread" Tokio runtime!
//!
//!
//! TODO: This would be a good place to also handle panics in a somewhat sane way.
//! Depending on what thread panics, we might want to kill the whole server, or
//! only a single tenant or timeline.
//!
use std::cell::RefCell;
use std::collections::HashMap;
use std::panic;
use std::panic::AssertUnwindSafe;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use std::thread;
use std::thread::JoinHandle;
use tokio::sync::watch;
use tracing::{info, warn};
use lazy_static::lazy_static;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
lazy_static! {
/// Each thread that we track is associated with a "thread ID". It's just
/// an increasing number that we assign, not related to any system thread
/// id.
static ref NEXT_THREAD_ID: AtomicU64 = AtomicU64::new(1);
/// Global registry of threads
static ref THREADS: Mutex<HashMap<u64, Arc<PageServerThread>>> = Mutex::new(HashMap::new());
}
// There is a Tokio watch channel for each thread, which can be used to signal the
// thread that it needs to shut down. This thread local variable holds the receiving
// end of the channel. The sender is kept in the global registry, so that anyone
// can send the signal to request thread shutdown.
thread_local!(static SHUTDOWN_RX: RefCell<Option<watch::Receiver<()>>> = RefCell::new(None));
// Each thread holds reference to its own PageServerThread here.
thread_local!(static CURRENT_THREAD: RefCell<Option<Arc<PageServerThread>>> = RefCell::new(None));
///
/// There are many kinds of threads in the system. Some are associated with a particular
/// tenant or timeline, while others are global.
///
/// Note that we don't try to limit how many threads of a certain kind can be running
/// at the same time.
///
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum ThreadKind {
// libpq listener thread. It just accepts connection and spawns a
// PageRequestHandler thread for each connection.
LibpqEndpointListener,
// HTTP endpoint listener.
HttpEndpointListener,
// Thread that handles a single connection. A PageRequestHandler thread
// starts detached from any particular tenant or timeline, but it can be
// associated with one later, after receiving a command from the client.
PageRequestHandler,
// Thread that connects to a safekeeper to fetch WAL for one timeline.
WalReceiver,
// Thread that handles checkpointing of all timelines for a tenant.
Checkpointer,
// Thread that handles GC of a tenant
GarbageCollector,
// Thread for synchronizing pageserver relish data with the remote storage.
// Shared by all tenants.
StorageSync,
}
struct PageServerThread {
_thread_id: u64,
kind: ThreadKind,
/// Tenant and timeline that this thread is associated with.
tenant_id: Option<ZTenantId>,
timeline_id: Option<ZTimelineId>,
name: String,
// To request thread shutdown, set the flag, and send a dummy message to the
// channel to notify it.
shutdown_requested: AtomicBool,
shutdown_tx: watch::Sender<()>,
/// Handle for waiting for the thread to exit. It can be None, if the
/// the thread has already exited.
join_handle: Mutex<Option<JoinHandle<()>>>,
}
/// Launch a new thread
pub fn spawn<F, E>(
kind: ThreadKind,
tenant_id: Option<ZTenantId>,
timeline_id: Option<ZTimelineId>,
name: &str,
f: F,
) -> std::io::Result<()>
where
F: FnOnce() -> Result<(), E> + Send + 'static,
{
let (shutdown_tx, shutdown_rx) = watch::channel(());
let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed);
let thread = PageServerThread {
_thread_id: thread_id,
kind,
tenant_id,
timeline_id,
name: name.to_string(),
shutdown_requested: AtomicBool::new(false),
shutdown_tx,
join_handle: Mutex::new(None),
};
let thread_rc = Arc::new(thread);
let mut jh_guard = thread_rc.join_handle.lock().unwrap();
THREADS
.lock()
.unwrap()
.insert(thread_id, Arc::clone(&thread_rc));
let thread_rc2 = Arc::clone(&thread_rc);
let join_handle = match thread::Builder::new()
.name(name.to_string())
.spawn(move || thread_wrapper(thread_id, thread_rc2, shutdown_rx, f))
{
Ok(handle) => handle,
Err(err) => {
// Could not spawn the thread. Remove the entry
THREADS.lock().unwrap().remove(&thread_id);
return Err(err);
}
};
*jh_guard = Some(join_handle);
drop(jh_guard);
// The thread is now running. Nothing more to do here
Ok(())
}
/// This wrapper function runs in a newly-spawned thread. It initializes the
/// thread-local variables and calls the payload function
fn thread_wrapper<F, E>(
thread_id: u64,
thread: Arc<PageServerThread>,
shutdown_rx: watch::Receiver<()>,
f: F,
) where
F: FnOnce() -> Result<(), E> + Send + 'static,
{
SHUTDOWN_RX.with(|rx| {
*rx.borrow_mut() = Some(shutdown_rx);
});
CURRENT_THREAD.with(|ct| {
*ct.borrow_mut() = Some(thread);
});
// We use AssertUnwindSafe here so that the payload function
// doesn't need to be UnwindSafe. We don't do anything after the
// unwinding that would expose us to unwind-unsafe behavior.
let result = panic::catch_unwind(AssertUnwindSafe(f));
// Remove our entry from the global hashmap.
THREADS.lock().unwrap().remove(&thread_id);
// If the thread payload panic'd, exit with the panic.
if let Err(err) = result {
panic::resume_unwind(err);
}
}
/// Is there a thread running that matches the criteria
/// Signal and wait for threads to shut down.
///
///
/// The arguments are used to select the threads to kill. Any None arguments are
/// ignored. For example, to shut down all WalReceiver threads:
///
/// shutdown_threads(Some(ThreadKind::WalReceiver), None, None)
///
/// Or to shut down all threads for given timeline:
///
/// shutdown_threads(None, Some(timelineid), None)
///
pub fn shutdown_threads(
kind: Option<ThreadKind>,
tenant_id: Option<ZTenantId>,
timeline_id: Option<ZTimelineId>,
) {
let mut victim_threads = Vec::new();
let threads = THREADS.lock().unwrap();
for thread in threads.values() {
if (kind.is_none() || Some(thread.kind) == kind)
&& (tenant_id.is_none() || thread.tenant_id == tenant_id)
&& (timeline_id.is_none() || thread.timeline_id == timeline_id)
{
thread.shutdown_requested.store(true, Ordering::Relaxed);
// FIXME: handle error?
let _ = thread.shutdown_tx.send(());
victim_threads.push(Arc::clone(thread));
}
}
drop(threads);
for thread in victim_threads {
info!("waiting for {} to shut down", thread.name);
if let Some(join_handle) = thread.join_handle.lock().unwrap().take() {
let _ = join_handle.join();
} else {
// The thread had not even fully started yet. Or it was shut down
// concurrently and already exited
}
}
}
/// A Future that can be used to check if the current thread has been requested to
/// shut down.
pub async fn shutdown_watcher() {
let _ = SHUTDOWN_RX
.with(|rx| {
rx.borrow()
.as_ref()
.expect("shutdown_requested() called in an unexpected thread")
.clone()
})
.changed()
.await;
}
/// Has the current thread been requested to shut down?
pub fn is_shutdown_requested() -> bool {
CURRENT_THREAD.with(|ct| {
if let Some(ct) = ct.borrow().as_ref() {
ct.shutdown_requested.load(Ordering::Relaxed)
} else {
if !cfg!(test) {
warn!("is_shutdown_requested() called in an unexpected thread");
}
false
}
})
}
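An illustrative payload for thread_mgr::spawn (editor's sketch, not part of the new module; the function name example_loop and the one-second cadence are made up): the loop polls is_shutdown_requested() between work items, and while it is blocked waiting it selects on shutdown_watcher() from a current-thread tokio runtime, as the module doc above requires.

use crate::thread_mgr::{is_shutdown_requested, shutdown_watcher};

fn example_loop() -> anyhow::Result<()> {
    let runtime = tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()?;
    loop {
        if is_shutdown_requested() {
            break;
        }
        runtime.block_on(async {
            tokio::select! {
                // Wakes up as soon as shutdown_threads() signals this thread.
                _ = shutdown_watcher() => {}
                _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
                    // One unit of periodic work would go here.
                }
            }
        });
    }
    Ok(())
}

// Spawned like any other registered thread, e.g.:
// thread_mgr::spawn(ThreadKind::GarbageCollector, Some(tenant_id), None, "example", example_loop)?;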


@@ -10,15 +10,46 @@
//! This is similar to PostgreSQL's virtual file descriptor facility in //! This is similar to PostgreSQL's virtual file descriptor facility in
//! src/backend/storage/file/fd.c //! src/backend/storage/file/fd.c
//! //!
use lazy_static::lazy_static;
use std::fs::{File, OpenOptions}; use std::fs::{File, OpenOptions};
use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write}; use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
use std::os::unix::fs::FileExt; use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{RwLock, RwLockWriteGuard}; use std::sync::{RwLock, RwLockWriteGuard};
use zenith_metrics::{register_histogram_vec, register_int_gauge_vec, HistogramVec, IntGaugeVec};
use once_cell::sync::OnceCell; use once_cell::sync::OnceCell;
// Metrics collected on disk IO operations
const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
0.000001, // 1 usec
0.00001, // 10 usec
0.0001, // 100 usec
0.001, // 1 msec
0.01, // 10 msec
0.1, // 100 msec
1.0, // 1 sec
];
lazy_static! {
static ref STORAGE_IO_TIME: HistogramVec = register_histogram_vec!(
"pageserver_io_time",
"Time spent in IO operations",
&["operation", "tenant_id", "timeline_id"],
STORAGE_IO_TIME_BUCKETS.into()
)
.expect("failed to define a metric");
}
lazy_static! {
static ref STORAGE_IO_SIZE: IntGaugeVec = register_int_gauge_vec!(
"pageserver_io_size",
"Amount of bytes",
&["operation", "tenant_id", "timeline_id"]
)
.expect("failed to define a metric");
}
/// ///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally /// A virtual file descriptor. You can use this just like std::fs::File, but internally
/// the underlying file is closed if the system is low on file descriptors, /// the underlying file is closed if the system is low on file descriptors,
@@ -51,6 +82,10 @@ pub struct VirtualFile {
/// storing it here. /// storing it here.
pub path: PathBuf, pub path: PathBuf,
open_options: OpenOptions, open_options: OpenOptions,
/// For metrics
tenantid: String,
timelineid: String,
} }
#[derive(PartialEq, Clone, Copy)] #[derive(PartialEq, Clone, Copy)]
@@ -145,7 +180,13 @@ impl OpenFiles {
// old file. // old file.
// //
if let Some(old_file) = slot_guard.file.take() { if let Some(old_file) = slot_guard.file.take() {
drop(old_file); // We do not have information about tenantid/timelineid of evicted file.
// It is possible to store path together with file or use filepath crate,
// but since close() is not expected to be fast, it is not so critical to gather
// precise per-tenant statistics here.
STORAGE_IO_TIME
.with_label_values(&["close", "-", "-"])
.observe_closure_duration(|| drop(old_file));
} }
// Prepare the slot for reuse and return it // Prepare the slot for reuse and return it
@@ -185,9 +226,20 @@ impl VirtualFile {
path: &Path, path: &Path,
open_options: &OpenOptions, open_options: &OpenOptions,
) -> Result<VirtualFile, std::io::Error> { ) -> Result<VirtualFile, std::io::Error> {
let parts = path.to_str().unwrap().split('/').collect::<Vec<&str>>();
let tenantid;
let timelineid;
if parts.len() > 5 && parts[parts.len() - 5] == "tenants" {
tenantid = parts[parts.len() - 4].to_string();
timelineid = parts[parts.len() - 2].to_string();
} else {
tenantid = "*".to_string();
timelineid = "*".to_string();
}
let (handle, mut slot_guard) = get_open_files().find_victim_slot(); let (handle, mut slot_guard) = get_open_files().find_victim_slot();
let file = STORAGE_IO_TIME
let file = open_options.open(path)?; .with_label_values(&["open", &tenantid, &timelineid])
.observe_closure_duration(|| open_options.open(path))?;
// Strip all options other than read and write. // Strip all options other than read and write.
// //
@@ -204,6 +256,8 @@ impl VirtualFile {
pos: 0, pos: 0,
path: path.to_path_buf(), path: path.to_path_buf(),
open_options: reopen_options, open_options: reopen_options,
tenantid,
timelineid,
}; };
slot_guard.file.replace(file); slot_guard.file.replace(file);
@@ -213,13 +267,13 @@ impl VirtualFile {
/// Call File::sync_all() on the underlying File. /// Call File::sync_all() on the underlying File.
pub fn sync_all(&self) -> Result<(), Error> { pub fn sync_all(&self) -> Result<(), Error> {
self.with_file(|file| file.sync_all())? self.with_file("fsync", |file| file.sync_all())?
} }
/// Helper function that looks up the underlying File for this VirtualFile, /// Helper function that looks up the underlying File for this VirtualFile,
/// opening it and evicting some other File if necessary. It calls 'func' /// opening it and evicting some other File if necessary. It calls 'func'
/// with the physical File. /// with the physical File.
fn with_file<F, R>(&self, mut func: F) -> Result<R, Error> fn with_file<F, R>(&self, op: &str, mut func: F) -> Result<R, Error>
where where
F: FnMut(&File) -> R, F: FnMut(&File) -> R,
{ {
@@ -242,7 +296,9 @@ impl VirtualFile {
if let Some(file) = &slot_guard.file { if let Some(file) = &slot_guard.file {
// Found a cached file descriptor. // Found a cached file descriptor.
slot.recently_used.store(true, Ordering::Relaxed); slot.recently_used.store(true, Ordering::Relaxed);
return Ok(func(file)); return Ok(STORAGE_IO_TIME
.with_label_values(&[op, &self.tenantid, &self.timelineid])
.observe_closure_duration(|| func(file)));
} }
} }
} }
@@ -267,7 +323,9 @@ impl VirtualFile {
let (handle, mut slot_guard) = open_files.find_victim_slot(); let (handle, mut slot_guard) = open_files.find_victim_slot();
// Open the physical file // Open the physical file
let file = self.open_options.open(&self.path)?; let file = STORAGE_IO_TIME
.with_label_values(&["open", &self.tenantid, &self.timelineid])
.observe_closure_duration(|| self.open_options.open(&self.path))?;
// Perform the requested operation on it // Perform the requested operation on it
// //
@@ -276,7 +334,9 @@ impl VirtualFile {
// library RwLock doesn't allow downgrading without releasing the lock, // library RwLock doesn't allow downgrading without releasing the lock,
// and that doesn't seem worth the trouble. (parking_lot RwLock would // and that doesn't seem worth the trouble. (parking_lot RwLock would
// allow it) // allow it)
let result = func(&file); let result = STORAGE_IO_TIME
.with_label_values(&[op, &self.tenantid, &self.timelineid])
.observe_closure_duration(|| func(&file));
// Store the File in the slot and update the handle in the VirtualFile // Store the File in the slot and update the handle in the VirtualFile
// to point to it. // to point to it.
@@ -299,7 +359,13 @@ impl Drop for VirtualFile {
let mut slot_guard = slot.inner.write().unwrap(); let mut slot_guard = slot.inner.write().unwrap();
if slot_guard.tag == handle.tag { if slot_guard.tag == handle.tag {
slot.recently_used.store(false, Ordering::Relaxed); slot.recently_used.store(false, Ordering::Relaxed);
slot_guard.file.take(); // Unlike files evicted by replacement algorithm, here
// we group close time by tenantid/timelineid.
// This allows comparing the number/time of "normal" file closes
// with file eviction.
STORAGE_IO_TIME
.with_label_values(&["close", &self.tenantid, &self.timelineid])
.observe_closure_duration(|| slot_guard.file.take());
} }
} }
} }
@@ -335,7 +401,7 @@ impl Seek for VirtualFile {
self.pos = offset; self.pos = offset;
} }
SeekFrom::End(offset) => { SeekFrom::End(offset) => {
self.pos = self.with_file(|mut file| file.seek(SeekFrom::End(offset)))?? self.pos = self.with_file("seek", |mut file| file.seek(SeekFrom::End(offset)))??
} }
SeekFrom::Current(offset) => { SeekFrom::Current(offset) => {
let pos = self.pos as i128 + offset as i128; let pos = self.pos as i128 + offset as i128;
@@ -357,11 +423,23 @@ impl Seek for VirtualFile {
impl FileExt for VirtualFile { impl FileExt for VirtualFile {
fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> { fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
self.with_file(|file| file.read_at(buf, offset))? let result = self.with_file("read", |file| file.read_at(buf, offset))?;
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["read", &self.tenantid, &self.timelineid])
.add(size as i64);
}
result
} }
fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> { fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
self.with_file(|file| file.write_at(buf, offset))? let result = self.with_file("write", |file| file.write_at(buf, offset))?;
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["write", &self.tenantid, &self.timelineid])
.add(size as i64);
}
result
} }
} }
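Editor's sketch of the path layout the new label-extraction code assumes: <workdir>/tenants/<tenant id>/timelines/<timeline id>/<layer file>. Counting the '/'-separated parts from the end, [len-5] must be "tenants", [len-4] is the tenant id and [len-2] the timeline id; any other path falls back to the catch-all "*" labels. The helper and test below only restate that logic for illustration.

fn metric_labels(path: &str) -> (String, String) {
    let parts: Vec<&str> = path.split('/').collect();
    if parts.len() > 5 && parts[parts.len() - 5] == "tenants" {
        (
            parts[parts.len() - 4].to_string(), // tenant id label
            parts[parts.len() - 2].to_string(), // timeline id label
        )
    } else {
        ("*".to_string(), "*".to_string())
    }
}

#[test]
fn metric_labels_examples() {
    assert_eq!(
        metric_labels("/data/tenants/1234/timelines/abcd/000000016B59D8"),
        ("1234".to_string(), "abcd".to_string())
    );
    assert_eq!(
        metric_labels("/tmp/unrelated/file"),
        ("*".to_string(), "*".to_string())
    );
}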


@@ -7,10 +7,11 @@
use crate::config::PageServerConf; use crate::config::PageServerConf;
use crate::tenant_mgr; use crate::tenant_mgr;
use crate::tenant_mgr::TenantState; use crate::thread_mgr;
use crate::tenant_threads; use crate::thread_mgr::ThreadKind;
use crate::walingest::WalIngest; use crate::walingest::WalIngest;
use anyhow::{bail, Context, Error, Result}; use anyhow::{bail, Context, Error, Result};
use bytes::BytesMut;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use parking_lot::Mutex; use parking_lot::Mutex;
use postgres_ffi::waldecoder::*; use postgres_ffi::waldecoder::*;
@@ -19,32 +20,26 @@ use postgres_types::PgLsn;
use std::cell::Cell; use std::cell::Cell;
use std::collections::HashMap; use std::collections::HashMap;
use std::str::FromStr; use std::str::FromStr;
use std::thread;
use std::thread::JoinHandle;
use std::thread_local; use std::thread_local;
use std::time::SystemTime; use std::time::SystemTime;
use tokio::pin; use tokio::pin;
use tokio::sync::oneshot;
use tokio_postgres::replication::ReplicationStream; use tokio_postgres::replication::ReplicationStream;
use tokio_postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow}; use tokio_postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
use tokio_stream::StreamExt; use tokio_stream::StreamExt;
use tracing::*; use tracing::*;
use zenith_utils::lsn::Lsn; use zenith_utils::lsn::Lsn;
use zenith_utils::pq_proto::ZenithFeedback;
use zenith_utils::zid::ZTenantId; use zenith_utils::zid::ZTenantId;
use zenith_utils::zid::ZTimelineId; use zenith_utils::zid::ZTimelineId;
// //
// We keep one WAL Receiver active per timeline. // We keep one WAL Receiver active per timeline.
// //
struct WalReceiverEntry { struct WalReceiverEntry {
wal_producer_connstr: String, wal_producer_connstr: String,
wal_receiver_handle: Option<JoinHandle<()>>,
wal_receiver_interrupt_sender: Option<oneshot::Sender<()>>,
tenantid: ZTenantId,
} }
lazy_static! { lazy_static! {
static ref WAL_RECEIVERS: Mutex<HashMap<ZTimelineId, WalReceiverEntry>> = static ref WAL_RECEIVERS: Mutex<HashMap<(ZTenantId, ZTimelineId), WalReceiverEntry>> =
Mutex::new(HashMap::new()); Mutex::new(HashMap::new());
} }
@@ -55,97 +50,55 @@ thread_local! {
pub(crate) static IS_WAL_RECEIVER: Cell<bool> = Cell::new(false); pub(crate) static IS_WAL_RECEIVER: Cell<bool> = Cell::new(false);
} }
// Wait for walreceiver to stop fn drop_wal_receiver(tenantid: ZTenantId, timelineid: ZTimelineId) {
// Now it stops when pageserver shutdown is requested.
// In future we can make this more granular and send shutdown signals
// per tenant/timeline to cancel inactive walreceivers.
// TODO deal with blocking pg connections
pub fn stop_wal_receiver(timelineid: ZTimelineId) {
let mut receivers = WAL_RECEIVERS.lock(); let mut receivers = WAL_RECEIVERS.lock();
receivers.remove(&(tenantid, timelineid));
if let Some(r) = receivers.get_mut(&timelineid) {
match r.wal_receiver_interrupt_sender.take() {
Some(s) => {
if s.send(()).is_err() {
warn!("wal receiver interrupt signal already sent");
}
}
None => {
warn!("wal_receiver_interrupt_sender is missing, wal recever shouldn't be running")
}
}
info!("waiting for wal receiver to stop");
let handle = r.wal_receiver_handle.take();
// do not hold the lock while joining the handle (deadlock is possible otherwise)
drop(receivers);
// there is no timeout or try_join option available so in case of a bug this can hang forever
handle.map(JoinHandle::join);
}
}
pub fn drop_wal_receiver(timelineid: ZTimelineId, tenantid: ZTenantId) {
let mut receivers = WAL_RECEIVERS.lock();
receivers.remove(&timelineid);
// Check if it was the last walreceiver of the tenant.
// TODO now we store one WalReceiverEntry per timeline,
// so this iterator looks a bit strange.
for (_timelineid, entry) in receivers.iter() {
if entry.tenantid == tenantid {
return;
}
}
// When last walreceiver of the tenant is gone, change state to Idle
tenant_mgr::set_tenant_state(tenantid, TenantState::Idle).unwrap();
} }
// Launch a new WAL receiver, or tell one that's running about change in connection string // Launch a new WAL receiver, or tell one that's running about change in connection string
pub fn launch_wal_receiver( pub fn launch_wal_receiver(
conf: &'static PageServerConf, conf: &'static PageServerConf,
tenantid: ZTenantId,
timelineid: ZTimelineId, timelineid: ZTimelineId,
wal_producer_connstr: &str, wal_producer_connstr: &str,
tenantid: ZTenantId, ) -> Result<()> {
) {
let mut receivers = WAL_RECEIVERS.lock(); let mut receivers = WAL_RECEIVERS.lock();
match receivers.get_mut(&timelineid) { match receivers.get_mut(&(tenantid, timelineid)) {
Some(receiver) => { Some(receiver) => {
info!("wal receiver already running, updating connection string");
receiver.wal_producer_connstr = wal_producer_connstr.into(); receiver.wal_producer_connstr = wal_producer_connstr.into();
} }
None => { None => {
let (tx, rx) = tokio::sync::oneshot::channel::<()>(); thread_mgr::spawn(
ThreadKind::WalReceiver,
let wal_receiver_handle = thread::Builder::new() Some(tenantid),
.name("WAL receiver thread".into()) Some(timelineid),
.spawn(move || { "WAL receiver thread",
move || {
IS_WAL_RECEIVER.with(|c| c.set(true)); IS_WAL_RECEIVER.with(|c| c.set(true));
thread_main(conf, timelineid, tenantid, rx); thread_main(conf, tenantid, timelineid)
}) },
.unwrap(); )?;
let receiver = WalReceiverEntry { let receiver = WalReceiverEntry {
wal_producer_connstr: wal_producer_connstr.into(), wal_producer_connstr: wal_producer_connstr.into(),
wal_receiver_handle: Some(wal_receiver_handle),
wal_receiver_interrupt_sender: Some(tx),
tenantid,
}; };
receivers.insert(timelineid, receiver); receivers.insert((tenantid, timelineid), receiver);
// Update tenant state and start tenant threads, if they are not running yet. // Update tenant state and start tenant threads, if they are not running yet.
tenant_mgr::set_tenant_state(tenantid, TenantState::Active).unwrap(); tenant_mgr::activate_tenant(conf, tenantid)?;
tenant_threads::start_tenant_threads(conf, tenantid);
} }
}; };
Ok(())
} }
// Look up current WAL producer connection string in the hash table // Look up current WAL producer connection string in the hash table
fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String { fn get_wal_producer_connstr(tenantid: ZTenantId, timelineid: ZTimelineId) -> String {
let receivers = WAL_RECEIVERS.lock(); let receivers = WAL_RECEIVERS.lock();
receivers receivers
.get(&timelineid) .get(&(tenantid, timelineid))
.unwrap() .unwrap()
.wal_producer_connstr .wal_producer_connstr
.clone() .clone()
@@ -156,25 +109,18 @@ fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
// //
fn thread_main( fn thread_main(
conf: &'static PageServerConf, conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId, tenantid: ZTenantId,
interrupt_receiver: oneshot::Receiver<()>, timelineid: ZTimelineId,
) { ) -> Result<()> {
let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered(); let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered();
info!("WAL receiver thread started"); info!("WAL receiver thread started");
// Look up the current WAL producer address // Look up the current WAL producer address
let wal_producer_connstr = get_wal_producer_connstr(timelineid); let wal_producer_connstr = get_wal_producer_connstr(tenantid, timelineid);
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server, // Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
// and start streaming WAL from it. // and start streaming WAL from it.
let res = walreceiver_main( let res = walreceiver_main(conf, tenantid, timelineid, &wal_producer_connstr);
conf,
tenantid,
timelineid,
&wal_producer_connstr,
interrupt_receiver,
);
// TODO cleanup info messages // TODO cleanup info messages
if let Err(e) = res { if let Err(e) = res {
@@ -188,7 +134,8 @@ fn thread_main(
// Drop it from list of active WAL_RECEIVERS // Drop it from list of active WAL_RECEIVERS
// so that next callmemaybe request launched a new thread // so that next callmemaybe request launched a new thread
drop_wal_receiver(timelineid, tenantid); drop_wal_receiver(tenantid, timelineid);
Ok(())
} }
fn walreceiver_main( fn walreceiver_main(
@@ -196,7 +143,6 @@ fn walreceiver_main(
tenantid: ZTenantId, tenantid: ZTenantId,
timelineid: ZTimelineId, timelineid: ZTimelineId,
wal_producer_connstr: &str, wal_producer_connstr: &str,
mut interrupt_receiver: oneshot::Receiver<()>,
) -> Result<(), Error> { ) -> Result<(), Error> {
// Connect to the database in replication mode. // Connect to the database in replication mode.
info!("connecting to {:?}", wal_producer_connstr); info!("connecting to {:?}", wal_producer_connstr);
@@ -273,12 +219,15 @@ fn walreceiver_main(
let mut walingest = WalIngest::new(&*timeline, startpoint)?; let mut walingest = WalIngest::new(&*timeline, startpoint)?;
while let Some(replication_message) = runtime.block_on(async { while let Some(replication_message) = runtime.block_on(async {
let shutdown_watcher = thread_mgr::shutdown_watcher();
tokio::select! { tokio::select! {
replication_message = physical_stream.next() => replication_message, // check for shutdown first
_ = &mut interrupt_receiver => { biased;
_ = shutdown_watcher => {
info!("walreceiver interrupted"); info!("walreceiver interrupted");
None None
} }
replication_message = physical_stream.next() => replication_message,
} }
}) { }) {
let replication_message = replication_message?; let replication_message = replication_message?;
@@ -339,7 +288,6 @@ fn walreceiver_main(
}; };
if let Some(last_lsn) = status_update { if let Some(last_lsn) = status_update {
let last_lsn = PgLsn::from(u64::from(last_lsn));
let timeline_synced_disk_consistent_lsn = let timeline_synced_disk_consistent_lsn =
tenant_mgr::get_repository_for_tenant(tenantid)? tenant_mgr::get_repository_for_tenant(tenantid)?
.get_timeline_state(timelineid) .get_timeline_state(timelineid)
@@ -347,18 +295,32 @@ fn walreceiver_main(
.unwrap_or(Lsn(0)); .unwrap_or(Lsn(0));
// The last LSN we processed. It is not guaranteed to survive pageserver crash. // The last LSN we processed. It is not guaranteed to survive pageserver crash.
let write_lsn = last_lsn; let write_lsn = u64::from(last_lsn);
// `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data
let flush_lsn = PgLsn::from(u64::from(timeline.get_disk_consistent_lsn())); let flush_lsn = u64::from(timeline.get_disk_consistent_lsn());
// The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
// Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
let apply_lsn = PgLsn::from(u64::from(timeline_synced_disk_consistent_lsn)); let apply_lsn = u64::from(timeline_synced_disk_consistent_lsn);
let ts = SystemTime::now(); let ts = SystemTime::now();
const NO_REPLY: u8 = 0;
// Send zenith feedback message.
// Regular standby_status_update fields are put into this message.
let zenith_status_update = ZenithFeedback {
current_timeline_size: timeline.get_current_logical_size() as u64,
ps_writelsn: write_lsn,
ps_flushlsn: flush_lsn,
ps_applylsn: apply_lsn,
ps_replytime: ts,
};
debug!("zenith_status_update {:?}", zenith_status_update);
let mut data = BytesMut::new();
zenith_status_update.serialize(&mut data)?;
runtime.block_on( runtime.block_on(
physical_stream physical_stream
.as_mut() .as_mut()
.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY), .zenith_status_update(data.len() as u64, &data),
)?; )?;
} }
} }

poetry.lock (generated, 2067 lines): file diff suppressed because one or more lines are too long

@@ -2,7 +2,7 @@
name = "postgres_ffi" name = "postgres_ffi"
version = "0.1.0" version = "0.1.0"
authors = ["Heikki Linnakangas <heikki@zenith.tech>"] authors = ["Heikki Linnakangas <heikki@zenith.tech>"]
edition = "2018" edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html


@@ -51,6 +51,13 @@ pub type TimeLineID = u32;
pub type TimestampTz = i64; pub type TimestampTz = i64;
pub type XLogSegNo = u64; pub type XLogSegNo = u64;
/// Interval of checkpointing the metadata file. We should store the metadata file to enforce
/// the invariant that checkpoint.nextXid is larger than any XID in the WAL.
/// But flushing the checkpoint file for each transaction is too expensive,
/// so XID_CHECKPOINT_INTERVAL is used to forward-align nextXid, and the
/// metadata checkpoint is performed only once per XID_CHECKPOINT_INTERVAL transactions.
/// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
/// in order to let the CLOG_TRUNCATE mechanism correctly extend the CLOG.
const XID_CHECKPOINT_INTERVAL: u32 = 1024; const XID_CHECKPOINT_INTERVAL: u32 = 1024;
#[allow(non_snake_case)] #[allow(non_snake_case)]
@@ -400,9 +407,13 @@ impl CheckPoint {
/// ///
/// Returns 'true' if the XID was updated. /// Returns 'true' if the XID was updated.
pub fn update_next_xid(&mut self, xid: u32) -> bool { pub fn update_next_xid(&mut self, xid: u32) -> bool {
let xid = xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1); // nextXid should be greater than any XID in WAL, so increment the provided XID and check for wraparound.
let mut new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
// To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
new_xid =
new_xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
let full_xid = self.nextXid.value; let full_xid = self.nextXid.value;
let new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
let old_xid = full_xid as u32; let old_xid = full_xid as u32;
if new_xid.wrapping_sub(old_xid) as i32 > 0 { if new_xid.wrapping_sub(old_xid) as i32 > 0 {
let mut epoch = full_xid >> 32; let mut epoch = full_xid >> 32;
@@ -520,4 +531,34 @@ mod tests {
println!("wal_end={}, tli={}", wal_end, tli); println!("wal_end={}, tli={}", wal_end, tli);
assert_eq!(wal_end, waldump_wal_end); assert_eq!(wal_end, waldump_wal_end);
} }
/// Check the math in update_next_xid
///
/// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
/// currently 1024.
#[test]
pub fn test_update_next_xid() {
let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
checkpoint.nextXid = FullTransactionId { value: 10 };
assert_eq!(checkpoint.nextXid.value, 10);
// The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
// boundary
checkpoint.update_next_xid(100);
assert_eq!(checkpoint.nextXid.value, 1024);
// No change
checkpoint.update_next_xid(500);
assert_eq!(checkpoint.nextXid.value, 1024);
checkpoint.update_next_xid(1023);
assert_eq!(checkpoint.nextXid.value, 1024);
// The function returns the *next* XID, given the highest XID seen so
// far. So when we pass 1024, the nextXid gets bumped up to the next
// XID_CHECKPOINT_INTERVAL boundary.
checkpoint.update_next_xid(1024);
assert_eq!(checkpoint.nextXid.value, 2048);
}
} }
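Editor's note: the new update_next_xid code first bumps the XID to at least xid + 1 (and at least FIRST_NORMAL_TRANSACTION_ID) and then uses the usual power-of-two round-up trick, (x + (N - 1)) & !(N - 1). For N = 1024 this maps 101 to 1024 and 1025 to 2048, which is why update_next_xid(100) and update_next_xid(1024) in the test above land on 1024 and 2048.

fn round_up_to_interval(x: u32, n: u32) -> u32 {
    debug_assert!(n.is_power_of_two());
    x.wrapping_add(n - 1) & !(n - 1)
}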


@@ -38,7 +38,7 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str:
def yapf(fix_inplace: bool) -> str: def yapf(fix_inplace: bool) -> str:
cmd = "pipenv run yapf --recursive" cmd = "poetry run yapf --recursive"
if fix_inplace: if fix_inplace:
cmd += " --in-place" cmd += " --in-place"
else: else:
@@ -47,7 +47,7 @@ def yapf(fix_inplace: bool) -> str:
def mypy() -> str: def mypy() -> str:
return "pipenv run mypy" return "poetry run mypy"
def get_commit_files() -> List[str]: def get_commit_files() -> List[str]:
@@ -72,7 +72,7 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color:
print("Please inspect the output below and run make fmt to fix automatically.") print("Please inspect the output below and run make fmt to fix automatically.")
if suffix == ".py": if suffix == ".py":
print("If the output is empty, ensure that you've installed Python tooling by\n" print("If the output is empty, ensure that you've installed Python tooling by\n"
"running 'pipenv install --dev' in the current directory (no root needed)") "running './scripts/pysync' in the current directory (no root needed)")
print() print()
print(res.stdout.decode()) print(res.stdout.decode())
exit(1) exit(1)

View File

@@ -2,7 +2,7 @@
name = "proxy" name = "proxy"
version = "0.1.0" version = "0.1.0"
authors = ["Stas Kelvich <stas.kelvich@gmail.com>"] authors = ["Stas Kelvich <stas.kelvich@gmail.com>"]
edition = "2018" edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -13,13 +13,27 @@ lazy_static = "1.4.0"
md5 = "0.7.0" md5 = "0.7.0"
rand = "0.8.3" rand = "0.8.3"
hex = "0.4.3" hex = "0.4.3"
hyper = "0.14"
routerify = "2"
parking_lot = "0.11.2" parking_lot = "0.11.2"
hashbrown = "0.11.2"
serde = "1" serde = "1"
serde_json = "1" serde_json = "1"
tokio = { version = "1.11", features = ["macros"] } tokio = { version = "1.11", features = ["macros"] }
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
tokio-rustls = "0.22.0"
clap = "2.33.0" clap = "2.33.0"
rustls = "0.19.1" rustls = "0.19.1"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
pin-project-lite = "0.2.7"
futures = "0.3.13"
scopeguard = "1.1.0"
zenith_utils = { path = "../zenith_utils" } zenith_utils = { path = "../zenith_utils" }
zenith_metrics = { path = "../zenith_metrics" }
base64 = "0.13.0"
async-trait = "0.1.52"
[dev-dependencies]
tokio-postgres-rustls = "0.8.0"
rcgen = "0.8.14"

41
proxy/src/auth.rs Normal file
View File

@@ -0,0 +1,41 @@
use crate::db::AuthSecret;
use crate::stream::PqStream;
use bytes::Bytes;
use tokio::io::{AsyncRead, AsyncWrite};
use zenith_utils::pq_proto::BeMessage as Be;
/// Stored secret for authenticating the user via md5 but authenticating
/// to the compute database with a (possibly different) plaintext password.
pub struct PlaintextStoredSecret {
pub salt: [u8; 4],
pub hashed_salted_password: Bytes,
pub compute_db_password: String,
}
/// Sufficient information to auth user and create AuthSecret
#[non_exhaustive]
pub enum StoredSecret {
PlaintextPassword(PlaintextStoredSecret),
// TODO add md5 option?
// TODO add SCRAM option
}
pub async fn authenticate(
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
stored_secret: StoredSecret
) -> anyhow::Result<AuthSecret> {
match stored_secret {
StoredSecret::PlaintextPassword(stored) => {
client.write_message(&Be::AuthenticationMD5Password(&stored.salt)).await?;
let provided = client.read_password_message().await?;
anyhow::ensure!(provided == stored.hashed_salted_password);
Ok(AuthSecret::Password(stored.compute_db_password))
},
}
}
#[async_trait::async_trait]
pub trait SecretStore {
async fn get_stored_secret(&self, creds: &crate::cplane_api::ClientCredentials) -> anyhow::Result<StoredSecret>;
}
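For context, `hashed_salted_password` is compared against the client's reply to AuthenticationMD5Password. Below is a hypothetical helper (not part of this diff) sketching the conventional libpq md5 scheme, using the md5 crate already listed in the proxy's Cargo.toml; the trailing "\0" on the stored value in mock.rs further down corresponds to the NUL-terminated wire string the client sends:

// Hypothetical sketch: response = "md5" + hex(md5(hex(md5(password ++ user)) ++ salt)).
fn md5_auth_response(user: &str, password: &str, salt: &[u8; 4]) -> String {
    // Inner hash over password concatenated with the user name.
    let inner = format!("{:x}", md5::compute(format!("{}{}", password, user)));
    // Outer hash over the hex-encoded inner hash plus the 4-byte salt.
    let mut salted = inner.into_bytes();
    salted.extend_from_slice(salt);
    format!("md5{:x}", md5::compute(salted))
}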

90
proxy/src/cancellation.rs Normal file
View File

@@ -0,0 +1,90 @@
use anyhow::{anyhow, Context};
use hashbrown::HashMap;
use lazy_static::lazy_static;
use parking_lot::Mutex;
use std::net::SocketAddr;
use tokio::net::TcpStream;
use tokio_postgres::{CancelToken, NoTls};
use zenith_utils::pq_proto::CancelKeyData;
lazy_static! {
/// Enables serving CancelRequests.
static ref CANCEL_MAP: Mutex<HashMap<CancelKeyData, Option<CancelClosure>>> = Default::default();
}
/// This should've been a [`std::future::Future`], but
/// it's impossible to name a type of an unboxed future
/// (we'd need something like `#![feature(type_alias_impl_trait)]`).
#[derive(Clone)]
pub struct CancelClosure {
socket_addr: SocketAddr,
cancel_token: CancelToken,
}
impl CancelClosure {
pub fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self {
Self {
socket_addr,
cancel_token,
}
}
/// Cancels the query running on user's compute node.
pub async fn try_cancel_query(self) -> anyhow::Result<()> {
let socket = TcpStream::connect(self.socket_addr).await?;
self.cancel_token.cancel_query_raw(socket, NoTls).await?;
Ok(())
}
}
/// Cancel a running query for the corresponding connection.
pub async fn cancel_session(key: CancelKeyData) -> anyhow::Result<()> {
let cancel_closure = CANCEL_MAP
.lock()
.get(&key)
.and_then(|x| x.clone())
.with_context(|| format!("unknown session: {:?}", key))?;
cancel_closure.try_cancel_query().await
}
/// Helper for registering query cancellation tokens.
pub struct Session(CancelKeyData);
impl Session {
/// Store the cancel token for the given session.
pub fn enable_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
CANCEL_MAP.lock().insert(self.0, Some(cancel_closure));
self.0
}
}
/// Run async action within an ephemeral session identified by [`CancelKeyData`].
pub async fn with_session<F, R, V>(f: F) -> anyhow::Result<V>
where
F: FnOnce(Session) -> R,
R: std::future::Future<Output = anyhow::Result<V>>,
{
// HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
// expose it and we don't want to do another roundtrip to query
// for it. The client will be able to notice that this is not the
// actual backend_pid, but backend_pid is not used for anything
// so it doesn't matter.
let key = rand::random();
// The birthday problem is unlikely to happen here, but it's still possible
CANCEL_MAP
.lock()
.try_insert(key, None)
.map_err(|_| anyhow!("session already exists: {:?}", key))?;
// This will guarantee that the session gets dropped
// as soon as the future is finished.
scopeguard::defer! {
CANCEL_MAP.lock().remove(&key);
}
let session = Session(key);
f(session).await
}
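A hedged usage sketch (mirroring what proxy.rs does further down, and assuming this cancellation module is in scope): run the per-connection work inside with_session, and once the compute connection exists, register its CancelClosure (built from the socket address and client.cancel_token()) to obtain the CancelKeyData reported to the client:

async fn serve_connection(closure: CancelClosure) -> anyhow::Result<()> {
    with_session(|session| async move {
        // Registering the closure yields the CancelKeyData the proxy reports
        // back to the client in BackendKeyData. The CANCEL_MAP entry is
        // removed automatically when this future finishes (see the scopeguard above).
        let _cancel_key_data = session.enable_cancellation(closure);
        // ... handshake and traffic proxying would happen here ...
        Ok(())
    })
    .await
}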

7
proxy/src/compute.rs Normal file
View File

@@ -0,0 +1,7 @@
use crate::{cplane_api::ClientCredentials, db::DatabaseConnInfo};
#[async_trait::async_trait]
pub trait ComputeProvider {
async fn get_compute_node(&self, creds: &ClientCredentials) -> anyhow::Result<DatabaseConnInfo>;
}

View File

@@ -1,9 +1,33 @@
use anyhow::{anyhow, bail, Context}; use anyhow::{anyhow, bail, Context};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::net::{SocketAddr, ToSocketAddrs}; use std::net::{SocketAddr, ToSocketAddrs};
use std::collections::HashMap;
use crate::state::ProxyWaiters; use crate::state::ProxyWaiters;
#[derive(Debug, PartialEq, Eq)]
pub struct ClientCredentials {
pub user: String,
pub dbname: String,
}
impl TryFrom<HashMap<String, String>> for ClientCredentials {
type Error = anyhow::Error;
fn try_from(mut value: HashMap<String, String>) -> Result<Self, Self::Error> {
let mut get_param = |key| {
value
.remove(key)
.with_context(|| format!("{} is missing in startup packet", key))
};
let user = get_param("user")?;
let db = get_param("database")?;
Ok(Self { user, dbname: db })
}
}
#[derive(Serialize, Deserialize, Debug, Default)] #[derive(Serialize, Deserialize, Debug, Default)]
pub struct DatabaseInfo { pub struct DatabaseInfo {
pub host: String, pub host: String,
@@ -21,35 +45,6 @@ enum ProxyAuthResponse {
NotReady { ready: bool }, // TODO: get rid of `ready` NotReady { ready: bool }, // TODO: get rid of `ready`
} }
impl DatabaseInfo {
pub fn socket_addr(&self) -> anyhow::Result<SocketAddr> {
let host_port = format!("{}:{}", self.host, self.port);
host_port
.to_socket_addrs()
.with_context(|| format!("cannot resolve {} to SocketAddr", host_port))?
.next()
.ok_or_else(|| anyhow!("cannot resolve at least one SocketAddr"))
}
}
impl From<DatabaseInfo> for tokio_postgres::Config {
fn from(db_info: DatabaseInfo) -> Self {
let mut config = tokio_postgres::Config::new();
config
.host(&db_info.host)
.port(db_info.port)
.dbname(&db_info.dbname)
.user(&db_info.user);
if let Some(password) = db_info.password {
config.password(password);
}
config
}
}
pub struct CPlaneApi<'a> { pub struct CPlaneApi<'a> {
auth_endpoint: &'a str, auth_endpoint: &'a str,
waiters: &'a ProxyWaiters, waiters: &'a ProxyWaiters,

58
proxy/src/db.rs Normal file
View File

@@ -0,0 +1,58 @@
///
/// Utils for connecting to the postgres database.
///
use std::net::{SocketAddr, ToSocketAddrs};
use anyhow::{Context, anyhow};
use crate::cplane_api::ClientCredentials;
pub struct DatabaseConnInfo {
pub host: String,
pub port: u16,
}
pub struct DatabaseAuthInfo {
pub conn_info: DatabaseConnInfo,
pub creds: ClientCredentials,
pub auth_secret: AuthSecret,
}
/// Sufficient information to auth with database
#[non_exhaustive]
#[derive(Debug)]
pub enum AuthSecret {
Password(String),
// TODO add SCRAM option
}
impl From<DatabaseAuthInfo> for tokio_postgres::Config {
fn from(auth_info: DatabaseAuthInfo) -> Self {
let mut config = tokio_postgres::Config::new();
config
.host(&auth_info.conn_info.host)
.port(auth_info.conn_info.port)
.dbname(&auth_info.creds.dbname)
.user(&auth_info.creds.user);
match auth_info.auth_secret {
AuthSecret::Password(password) => {
config.password(password);
}
}
config
}
}
impl DatabaseConnInfo {
pub fn socket_addr(&self) -> anyhow::Result<SocketAddr> {
let host_port = format!("{}:{}", self.host, self.port);
host_port
.to_socket_addrs()
.with_context(|| format!("cannot resolve {} to SocketAddr", host_port))?
.next()
.ok_or_else(|| anyhow!("cannot resolve at least one SocketAddr"))
}
}
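A small sketch with illustrative values only (assuming the types above and ClientCredentials from cplane_api are in scope), showing how these pieces compose into a tokio_postgres::Config via the From impl:

fn build_compute_config() -> anyhow::Result<(std::net::SocketAddr, tokio_postgres::Config)> {
    let conn_info = DatabaseConnInfo { host: "127.0.0.1".into(), port: 5432 };
    let addr = conn_info.socket_addr()?; // resolve before conn_info is moved below
    let auth_info = DatabaseAuthInfo {
        conn_info,
        creds: ClientCredentials { user: "postgres".into(), dbname: "postgres".into() },
        auth_secret: AuthSecret::Password("postgres".into()),
    };
    Ok((addr, auth_info.into()))
}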

26
proxy/src/http.rs Normal file
View File

@@ -0,0 +1,26 @@
use anyhow::anyhow;
use hyper::{Body, Request, Response, StatusCode};
use routerify::RouterBuilder;
use std::net::TcpListener;
use zenith_utils::http::endpoint;
use zenith_utils::http::error::ApiError;
use zenith_utils::http::json::json_response;
async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
Ok(json_response(StatusCode::OK, "")?)
}
fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
let router = endpoint::make_router();
router.get("/v1/status", status_handler)
}
pub async fn thread_main(http_listener: TcpListener) -> anyhow::Result<()> {
let service = || routerify::RouterService::new(make_router().build()?);
hyper::Server::from_tcp(http_listener)?
.serve(service().map_err(|e| anyhow!(e))?)
.await?;
Ok(())
}
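A hedged smoke check (assuming a proxy is running locally with the default --http address from main.rs), using the reqwest dependency already present in Cargo.toml:

fn check_proxy_status() -> anyhow::Result<()> {
    // The status route registered above answers on the http listener.
    let resp = reqwest::blocking::get("http://127.0.0.1:7001/v1/status")?;
    anyhow::ensure!(resp.status() == reqwest::StatusCode::OK, "unexpected status");
    Ok(())
}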

View File

@@ -8,16 +8,24 @@
use anyhow::bail; use anyhow::bail;
use clap::{App, Arg}; use clap::{App, Arg};
use state::{ProxyConfig, ProxyState}; use state::{ProxyConfig, ProxyState};
use std::thread;
use zenith_utils::{tcp_listener, GIT_VERSION}; use zenith_utils::{tcp_listener, GIT_VERSION};
mod compute;
mod mock;
mod auth;
mod db;
mod cancellation;
mod cplane_api; mod cplane_api;
mod http;
mod mgmt; mod mgmt;
mod proxy; mod proxy;
mod state; mod state;
mod stream;
mod waiters; mod waiters;
fn main() -> anyhow::Result<()> { #[tokio::main]
async fn main() -> anyhow::Result<()> {
zenith_metrics::set_common_metrics_prefix("zenith_proxy");
let arg_matches = App::new("Zenith proxy/router") let arg_matches = App::new("Zenith proxy/router")
.version(GIT_VERSION) .version(GIT_VERSION)
.arg( .arg(
@@ -36,6 +44,14 @@ fn main() -> anyhow::Result<()> {
.help("listen for management callback connection on ip:port") .help("listen for management callback connection on ip:port")
.default_value("127.0.0.1:7000"), .default_value("127.0.0.1:7000"),
) )
.arg(
Arg::with_name("http")
.short("h")
.long("http")
.takes_value(true)
.help("listen for incoming http connections (metrics, etc) on ip:port")
.default_value("127.0.0.1:7001"),
)
.arg( .arg(
Arg::with_name("uri") Arg::with_name("uri")
.short("u") .short("u")
@@ -49,7 +65,7 @@ fn main() -> anyhow::Result<()> {
.short("a") .short("a")
.long("auth-endpoint") .long("auth-endpoint")
.takes_value(true) .takes_value(true)
.help("redirect unauthenticated users to given uri") .help("API endpoint for authenticating users")
.default_value("http://localhost:3000/authenticate_proxy_request/"), .default_value("http://localhost:3000/authenticate_proxy_request/"),
) )
.arg( .arg(
@@ -82,6 +98,7 @@ fn main() -> anyhow::Result<()> {
let config = ProxyConfig { let config = ProxyConfig {
proxy_address: arg_matches.value_of("proxy").unwrap().parse()?, proxy_address: arg_matches.value_of("proxy").unwrap().parse()?,
mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?, mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?,
http_address: arg_matches.value_of("http").unwrap().parse()?,
redirect_uri: arg_matches.value_of("uri").unwrap().parse()?, redirect_uri: arg_matches.value_of("uri").unwrap().parse()?,
auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?,
ssl_config, ssl_config,
@@ -91,26 +108,23 @@ fn main() -> anyhow::Result<()> {
println!("Version: {}", GIT_VERSION); println!("Version: {}", GIT_VERSION);
// Check that we can bind to address before further initialization // Check that we can bind to address before further initialization
println!("Starting http on {}", state.conf.http_address);
let http_listener = tcp_listener::bind(state.conf.http_address)?;
println!("Starting proxy on {}", state.conf.proxy_address); println!("Starting proxy on {}", state.conf.proxy_address);
let pageserver_listener = tcp_listener::bind(state.conf.proxy_address)?; let proxy_listener = tokio::net::TcpListener::bind(state.conf.proxy_address).await?;
println!("Starting mgmt on {}", state.conf.mgmt_address); println!("Starting mgmt on {}", state.conf.mgmt_address);
let mgmt_listener = tcp_listener::bind(state.conf.mgmt_address)?; let mgmt_listener = tcp_listener::bind(state.conf.mgmt_address)?;
let threads = [ let http = tokio::spawn(http::thread_main(http_listener));
// Spawn a thread to listen for connections. It will spawn further threads let proxy = tokio::spawn(proxy::thread_main(state, proxy_listener));
// for each connection. let mgmt = tokio::task::spawn_blocking(move || mgmt::thread_main(state, mgmt_listener));
thread::Builder::new()
.name("Listener thread".into())
.spawn(move || proxy::thread_main(state, pageserver_listener))?,
thread::Builder::new()
.name("Mgmt thread".into())
.spawn(move || mgmt::thread_main(state, mgmt_listener))?,
];
for t in threads { let _ = futures::future::try_join_all([http, proxy, mgmt])
t.join().unwrap()?; .await?
} .into_iter()
.collect::<Result<Vec<()>, _>>()?;
Ok(()) Ok(())
} }

View File

@@ -111,7 +111,7 @@ fn try_process_query(
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
} }
Err(e) => { Err(e) => {
pgb.write_message(&BeMessage::ErrorResponse(e.to_string()))?; pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
} }
} }

32
proxy/src/mock.rs Normal file
View File

@@ -0,0 +1,32 @@
use bytes::Bytes;
use crate::{auth::{PlaintextStoredSecret, SecretStore, StoredSecret}, compute::ComputeProvider, cplane_api::ClientCredentials, db::DatabaseConnInfo};
pub struct MockConsole {
}
#[async_trait::async_trait]
impl SecretStore for MockConsole {
async fn get_stored_secret(&self, creds: &ClientCredentials) -> anyhow::Result<StoredSecret> {
let salt = [0; 4];
match (&creds.user[..], &creds.dbname[..]) {
("postgres", "postgres") => Ok(StoredSecret::PlaintextPassword(PlaintextStoredSecret {
salt,
hashed_salted_password: "md52fff09cd9def51601fc5445943b3a11f\0".into(),
compute_db_password: "postgres".into(),
})),
_ => unimplemented!()
}
}
}
#[async_trait::async_trait]
impl ComputeProvider for MockConsole {
async fn get_compute_node(&self, _creds: &ClientCredentials) -> anyhow::Result<DatabaseConnInfo> {
Ok(DatabaseConnInfo {
host: "127.0.0.1".into(),
port: 5432,
})
}
}

View File

@@ -1,272 +1,185 @@
use crate::cplane_api::{CPlaneApi, DatabaseInfo}; use crate::auth::{self, StoredSecret, SecretStore};
use crate::cancellation::{self, CancelClosure};
use crate::compute::ComputeProvider;
use crate::cplane_api as cplane;
use crate::db::{AuthSecret, DatabaseAuthInfo};
use crate::mock::MockConsole;
use crate::state::SslConfig;
use crate::stream::{PqStream, Stream};
use crate::ProxyState; use crate::ProxyState;
use anyhow::{anyhow, bail}; use anyhow::{bail, Context};
use lazy_static::lazy_static; use lazy_static::lazy_static;
use parking_lot::Mutex; use tokio::io::{AsyncRead, AsyncWrite};
use rand::prelude::StdRng; use tokio::net::TcpStream;
use rand::{Rng, SeedableRng};
use std::cell::Cell;
use std::collections::HashMap;
use std::net::{SocketAddr, TcpStream};
use std::{io, thread};
use tokio_postgres::NoTls; use tokio_postgres::NoTls;
use zenith_utils::postgres_backend::{self, PostgresBackend, ProtoState, Stream}; use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter};
use zenith_utils::pq_proto::{BeMessage as Be, FeMessage as Fe, *}; use zenith_utils::pq_proto::{BeMessage as Be, *};
use zenith_utils::sock_split::{ReadStream, WriteStream};
struct CancelClosure {
socket_addr: SocketAddr,
cancel_token: tokio_postgres::CancelToken,
}
impl CancelClosure {
async fn try_cancel_query(&self) {
if let Ok(socket) = tokio::net::TcpStream::connect(self.socket_addr).await {
// NOTE ignoring the result because:
// 1. This is a best effort attempt, the database doesn't have to listen
// 2. Being opaque about errors here helps avoid leaking info to unauthenticated user
let _ = self.cancel_token.cancel_query_raw(socket, NoTls).await;
}
}
}
lazy_static! { lazy_static! {
// Enables serving CancelRequests static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!(
static ref CANCEL_MAP: Mutex<HashMap<CancelKeyData, CancelClosure>> = Mutex::new(HashMap::new()); new_common_metric_name("num_connections_accepted"),
"Number of TCP client connections accepted."
)
.unwrap();
static ref NUM_CONNECTIONS_CLOSED_COUNTER: IntCounter = register_int_counter!(
new_common_metric_name("num_connections_closed"),
"Number of TCP client connections closed."
)
.unwrap();
static ref NUM_BYTES_PROXIED_COUNTER: IntCounter = register_int_counter!(
new_common_metric_name("num_bytes_proxied"),
"Number of bytes sent/received between any client and backend."
)
.unwrap();
} }
thread_local! { pub async fn thread_main(
// Used to clean up the CANCEL_MAP. Might not be necessary if we use tokio thread pool in main loop.
static THREAD_CANCEL_KEY_DATA: Cell<Option<CancelKeyData>> = Cell::new(None);
}
///
/// Main proxy listener loop.
///
/// Listens for connections, and launches a new handler thread for each.
///
pub fn thread_main(
state: &'static ProxyState, state: &'static ProxyState,
listener: std::net::TcpListener, listener: tokio::net::TcpListener,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
loop { loop {
let (socket, peer_addr) = listener.accept()?; let (socket, peer_addr) = listener.accept().await?;
println!("accepted connection from {}", peer_addr); println!("accepted connection from {}", peer_addr);
socket.set_nodelay(true).unwrap();
// TODO Use a threadpool instead. Maybe use tokio's threadpool by tokio::spawn(log_error(async {
// spawning a future into its runtime. Tokio's JoinError should socket
// allow us to handle cleanup properly even if the future panics. .set_nodelay(true)
thread::Builder::new() .context("failed to set socket option")?;
.name("Proxy thread".into())
.spawn(move || {
if let Err(err) = proxy_conn_main(state, socket) {
println!("error: {}", err);
}
// Clean up CANCEL_MAP. let tls = state.conf.ssl_config.clone();
THREAD_CANCEL_KEY_DATA.with(|cell| { handle_client(socket, tls).await
if let Some(cancel_key_data) = cell.get() { }));
CANCEL_MAP.lock().remove(&cancel_key_data);
};
});
})?;
} }
} }
// TODO: clean up fields async fn log_error<R, F>(future: F) -> F::Output
struct ProxyConnection { where
state: &'static ProxyState, F: std::future::Future<Output = anyhow::Result<R>>,
psql_session_id: String, {
pgb: PostgresBackend, future.await.map_err(|err| {
println!("error: {}", err.to_string());
err
})
} }
pub fn proxy_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> { async fn handle_client(
let conn = ProxyConnection { stream: impl AsyncRead + AsyncWrite + Unpin,
state, tls: Option<SslConfig>,
psql_session_id: hex::encode(rand::random::<[u8; 8]>()), ) -> anyhow::Result<()> {
pgb: PostgresBackend::new( // The `closed` counter will increase when this future is destroyed.
socket, NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
postgres_backend::AuthType::MD5, scopeguard::defer! {
state.conf.ssl_config.clone(), NUM_CONNECTIONS_CLOSED_COUNTER.inc();
false,
)?,
};
let (client, server) = match conn.handle_client()? {
Some(x) => x,
None => return Ok(()),
};
let server = zenith_utils::sock_split::BidiStream::from_tcp(server);
let client = match client {
Stream::Bidirectional(bidi_stream) => bidi_stream,
_ => panic!("invalid stream type"),
};
proxy(client.split(), server.split())
}
impl ProxyConnection {
/// Returns Ok(None) when connection was successfully closed.
fn handle_client(mut self) -> anyhow::Result<Option<(Stream, TcpStream)>> {
let mut authenticate = || {
let (username, dbname) = match self.handle_startup()? {
Some(x) => x,
None => return Ok(None),
};
// Both scenarios here should end up producing database credentials
if username.ends_with("@zenith") {
self.handle_existing_user(&username, &dbname).map(Some)
} else {
self.handle_new_user().map(Some)
}
};
let conn = match authenticate() {
Ok(Some(db_info)) => connect_to_db(db_info),
Ok(None) => return Ok(None),
Err(e) => {
// Report the error to the client
self.pgb.write_message(&Be::ErrorResponse(e.to_string()))?;
bail!("failed to handle client: {:?}", e);
}
};
// We'll get rid of this once migration to async is complete
let (pg_version, db_stream) = {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let (pg_version, stream, cancel_key_data) = runtime.block_on(conn)?;
self.pgb
.write_message(&BeMessage::BackendKeyData(cancel_key_data))?;
let stream = stream.into_std()?;
stream.set_nonblocking(false)?;
(pg_version, stream)
};
// Let the client send new requests
self.pgb
.write_message_noflush(&BeMessage::ParameterStatus(
BeParameterStatusMessage::ServerVersion(&pg_version),
))?
.write_message(&Be::ReadyForQuery)?;
Ok(Some((self.pgb.into_stream(), db_stream)))
} }
/// Returns Ok(None) when connection was successfully closed. if let Some((stream, creds)) = handshake(stream, tls).await? {
fn handle_startup(&mut self) -> anyhow::Result<Option<(String, String)>> { cancellation::with_session(|session| async {
let have_tls = self.pgb.tls_config.is_some(); connect_client_to_db(stream, creds, session).await
let mut encrypted = false; })
.await?;
}
loop { Ok(())
let msg = match self.pgb.read_message()? { }
Some(Fe::StartupPacket(msg)) => msg,
None => bail!("connection is lost"),
bad => bail!("unexpected message type: {:?}", bad),
};
println!("got message: {:?}", msg);
match msg { /// Handle a connection from one client.
FeStartupPacket::GssEncRequest => { /// For better testing experience, `stream` can be
self.pgb.write_message(&Be::EncryptionResponse(false))?; /// any object satisfying the traits.
} async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
FeStartupPacket::SslRequest => { stream: S,
self.pgb.write_message(&Be::EncryptionResponse(have_tls))?; mut tls: Option<SslConfig>,
if have_tls { ) -> anyhow::Result<Option<(PqStream<Stream<S>>, cplane::ClientCredentials)>> {
self.pgb.start_tls()?; // Client may try upgrading to each protocol only once
encrypted = true; let (mut tried_ssl, mut tried_gss) = (false, false);
let mut stream = PqStream::new(Stream::from_raw(stream));
loop {
let msg = stream.read_startup_packet().await?;
println!("got message: {:?}", msg);
use FeStartupPacket::*;
match msg {
SslRequest => match stream.get_ref() {
Stream::Raw { .. } if !tried_ssl => {
tried_ssl = true;
// We can't perform TLS handshake without a config
let enc = tls.is_some();
stream.write_message(&Be::EncryptionResponse(enc)).await?;
if let Some(tls) = tls.take() {
// Upgrade raw stream into a secure TLS-backed stream.
// NOTE: We've consumed `tls`; this fact will be used later.
stream = PqStream::new(stream.into_inner().upgrade(tls).await?);
} }
} }
FeStartupPacket::StartupMessage { mut params, .. } => { _ => bail!("protocol violation"),
if have_tls && !encrypted { },
bail!("must connect with TLS"); GssEncRequest => match stream.get_ref() {
} Stream::Raw { .. } if !tried_gss => {
tried_gss = true;
let mut get_param = |key| { // Currently, we don't support GSSAPI
params stream.write_message(&Be::EncryptionResponse(false)).await?;
.remove(key) }
.ok_or_else(|| anyhow!("{} is missing in startup packet", key)) _ => bail!("protocol violation"),
}; },
StartupMessage { params, .. } => {
// Check that the config has been consumed during upgrade
// OR we didn't provide it at all (for dev purposes).
if tls.is_some() {
let msg = "connection is insecure (try using `sslmode=require`)";
stream.write_message(&Be::ErrorResponse(msg)).await?;
bail!(msg);
}
return Ok(Some((get_param("user")?, get_param("database")?))); break Ok(Some((stream, params.try_into()?)));
} }
FeStartupPacket::CancelRequest(cancel_key_data) => { CancelRequest(cancel_key_data) => {
if let Some(cancel_closure) = CANCEL_MAP.lock().get(&cancel_key_data) { cancellation::cancel_session(cancel_key_data).await?;
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all() break Ok(None);
.build()
.unwrap();
runtime.block_on(cancel_closure.try_cancel_query());
}
return Ok(None);
}
} }
} }
} }
}
fn handle_existing_user(&mut self, user: &str, db: &str) -> anyhow::Result<DatabaseInfo> { async fn connect_client_to_db(
let md5_salt = rand::random::<[u8; 4]>(); mut client: PqStream<impl AsyncRead + AsyncWrite + Unpin>,
creds: cplane::ClientCredentials,
session: cancellation::Session,
) -> anyhow::Result<()> {
// Authenticate
// TODO use real console
let console = MockConsole {};
let stored_secret = console.get_stored_secret(&creds).await?;
let auth_secret = auth::authenticate(&mut client, stored_secret).await?;
let conn_info = console.get_compute_node(&creds).await?;
let db_auth_info = DatabaseAuthInfo {
conn_info,
creds,
auth_secret,
};
// Ask password // Connect to db
self.pgb let (mut db, version, cancel_closure) = connect_to_db(db_auth_info).await?;
.write_message(&Be::AuthenticationMD5Password(&md5_salt))?; let cancel_key_data = session.enable_cancellation(cancel_closure);
self.pgb.state = ProtoState::Authentication; // XXX
// Check password // Report success to client
let msg = match self.pgb.read_message()? { client
Some(Fe::PasswordMessage(msg)) => msg, .write_message_noflush(&Be::AuthenticationOk)?
None => bail!("connection is lost"), .write_message_noflush(&BeParameterStatusMessage::encoding())?
bad => bail!("unexpected message type: {:?}", bad), .write_message_noflush(&BeMessage::ParameterStatus(
}; BeParameterStatusMessage::ServerVersion(&version),
println!("got message: {:?}", msg); ))?
.write_message_noflush(&Be::BackendKeyData(cancel_key_data))?
.write_message(&BeMessage::ReadyForQuery)
.await?;
let (_trailing_null, md5_response) = msg let mut client = client.into_inner();
.split_last() let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?;
.ok_or_else(|| anyhow!("unexpected password message"))?;
let cplane = CPlaneApi::new(&self.state.conf.auth_endpoint, &self.state.waiters); Ok(())
let db_info = cplane.authenticate_proxy_request(
user,
db,
md5_response,
&md5_salt,
&self.psql_session_id,
)?;
self.pgb
.write_message_noflush(&Be::AuthenticationOk)?
.write_message_noflush(&BeParameterStatusMessage::encoding())?;
Ok(db_info)
}
fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
let greeting = hello_message(&self.state.conf.redirect_uri, &self.psql_session_id);
// First, register this session
let waiter = self.state.waiters.register(self.psql_session_id.clone());
// Give user a URL to spawn a new database
self.pgb
.write_message_noflush(&Be::AuthenticationOk)?
.write_message_noflush(&BeParameterStatusMessage::encoding())?
.write_message(&Be::NoticeResponse(greeting))?;
// Wait for web console response
let db_info = waiter.wait()?.map_err(|e| anyhow!(e))?;
self.pgb
.write_message_noflush(&Be::NoticeResponse("Connecting to database.".into()))?;
Ok(db_info)
}
} }
fn hello_message(redirect_uri: &str, session_id: &str) -> String { fn hello_message(redirect_uri: &str, session_id: &str) -> String {
@@ -284,80 +197,147 @@ fn hello_message(redirect_uri: &str, session_id: &str) -> String {
) )
} }
/// Create a TCP connection to a postgres database, authenticate with it, and receive the ReadyForQuery message /// Connect to a corresponding compute node.
async fn connect_to_db( async fn connect_to_db(
db_info: DatabaseInfo, db_info: DatabaseAuthInfo,
) -> anyhow::Result<(String, tokio::net::TcpStream, CancelKeyData)> { ) -> anyhow::Result<(TcpStream, String, CancelClosure)> {
// Make raw connection. When connect_raw finishes we've received ReadyForQuery. // TODO: establish a secure connection to the DB
let socket_addr = db_info.socket_addr()?; let socket_addr = db_info.conn_info.socket_addr()?;
let mut socket = tokio::net::TcpStream::connect(socket_addr).await?; let mut socket = TcpStream::connect(socket_addr).await?;
let config = tokio_postgres::Config::from(db_info);
// NOTE We effectively ignore some ParameterStatus and NoticeResponse
// messages here. Not sure if that could break something.
let (client, conn) = config.connect_raw(&mut socket, NoTls).await?;
// Save info for potentially cancelling the query later let (client, conn) = tokio_postgres::Config::from(db_info)
let mut rng = StdRng::from_entropy(); .connect_raw(&mut socket, NoTls)
let cancel_key_data = CancelKeyData { .await?;
// HACK We'd rather get the real backend_pid but tokio_postgres doesn't
// expose it and we don't want to do another roundtrip to query
// for it. The client will be able to notice that this is not the
// actual backend_pid, but backend_pid is not used for anything
// so it doesn't matter.
backend_pid: rng.gen(),
cancel_key: rng.gen(),
};
let cancel_closure = CancelClosure {
socket_addr,
cancel_token: client.cancel_token(),
};
CANCEL_MAP.lock().insert(cancel_key_data, cancel_closure);
THREAD_CANCEL_KEY_DATA.with(|cell| {
let prev_value = cell.replace(Some(cancel_key_data));
assert!(
prev_value.is_none(),
"THREAD_CANCEL_KEY_DATA was already set"
);
});
let version = conn.parameter("server_version").unwrap(); let version = conn
Ok((version.into(), socket, cancel_key_data)) .parameter("server_version")
.context("failed to fetch postgres server version")?
.into();
let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token());
Ok((socket, version, cancel_closure))
} }
/// Concurrently proxy both directions of the client and server connections #[cfg(test)]
fn proxy( mod tests {
(client_read, client_write): (ReadStream, WriteStream), use super::*;
(server_read, server_write): (ReadStream, WriteStream),
) -> anyhow::Result<()> {
fn do_proxy(mut reader: impl io::Read, mut writer: WriteStream) -> io::Result<u64> {
/// FlushWriter will make sure that every message is sent as soon as possible
struct FlushWriter<W>(W);
impl<W: io::Write> io::Write for FlushWriter<W> { use tokio::io::DuplexStream;
fn write(&mut self, buf: &[u8]) -> io::Result<usize> { use tokio_postgres::config::SslMode;
// `std::io::copy` is guaranteed to exit if we return an error, use tokio_postgres::tls::MakeTlsConnect;
// so we can afford to lose `res` in case `flush` fails use tokio_postgres_rustls::MakeRustlsConnect;
let res = self.0.write(buf);
if res.is_ok() {
self.flush()?;
}
res
}
fn flush(&mut self) -> io::Result<()> { async fn dummy_proxy(
self.0.flush() client: impl AsyncRead + AsyncWrite + Unpin,
} tls: Option<SslConfig>,
} ) -> anyhow::Result<()> {
// TODO: add some infra + tests for credentials
let (mut stream, _creds) = handshake(client, tls).await?.context("no stream")?;
let res = std::io::copy(&mut reader, &mut FlushWriter(&mut writer)); stream
writer.shutdown(std::net::Shutdown::Both)?; .write_message_noflush(&Be::AuthenticationOk)?
res .write_message_noflush(&BeParameterStatusMessage::encoding())?
.write_message(&BeMessage::ReadyForQuery)
.await?;
Ok(())
} }
let client_to_server_jh = thread::spawn(move || do_proxy(client_read, server_write)); fn generate_certs(
hostname: &str,
) -> anyhow::Result<(rustls::Certificate, rustls::Certificate, rustls::PrivateKey)> {
let ca = rcgen::Certificate::from_params({
let mut params = rcgen::CertificateParams::default();
params.is_ca = rcgen::IsCa::Ca(rcgen::BasicConstraints::Unconstrained);
params
})?;
do_proxy(server_read, client_write)?; let cert = rcgen::generate_simple_self_signed(vec![hostname.into()])?;
client_to_server_jh.join().unwrap()?; Ok((
rustls::Certificate(ca.serialize_der()?),
rustls::Certificate(cert.serialize_der_with_signer(&ca)?),
rustls::PrivateKey(cert.serialize_private_key_der()),
))
}
Ok(()) #[tokio::test]
async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> {
let (client, server) = tokio::io::duplex(1024);
let server_config = {
let (_ca, cert, key) = generate_certs("localhost")?;
let mut config = rustls::ServerConfig::new(rustls::NoClientAuth::new());
config.set_single_cert(vec![cert], key)?;
config
};
let proxy = tokio::spawn(dummy_proxy(client, Some(server_config.into())));
tokio_postgres::Config::new()
.user("john_doe")
.dbname("earth")
.ssl_mode(SslMode::Disable)
.connect_raw(server, NoTls)
.await
.err() // -> Option<E>
.context("client shouldn't be able to connect")?;
proxy
.await?
.err() // -> Option<E>
.context("server shouldn't accept client")?;
Ok(())
}
#[tokio::test]
async fn handshake_tls() -> anyhow::Result<()> {
let (client, server) = tokio::io::duplex(1024);
let (ca, cert, key) = generate_certs("localhost")?;
let server_config = {
let mut config = rustls::ServerConfig::new(rustls::NoClientAuth::new());
config.set_single_cert(vec![cert], key)?;
config
};
let proxy = tokio::spawn(dummy_proxy(client, Some(server_config.into())));
let client_config = {
let mut config = rustls::ClientConfig::new();
config.root_store.add(&ca)?;
config
};
let mut mk = MakeRustlsConnect::new(client_config);
let tls = MakeTlsConnect::<DuplexStream>::make_tls_connect(&mut mk, "localhost")?;
let (_client, _conn) = tokio_postgres::Config::new()
.user("john_doe")
.dbname("earth")
.ssl_mode(SslMode::Require)
.connect_raw(server, tls)
.await?;
proxy.await?
}
#[tokio::test]
async fn handshake_raw() -> anyhow::Result<()> {
let (client, server) = tokio::io::duplex(1024);
let proxy = tokio::spawn(dummy_proxy(client, None));
let (_client, _conn) = tokio_postgres::Config::new()
.user("john_doe")
.dbname("earth")
.ssl_mode(SslMode::Prefer)
.connect_raw(server, NoTls)
.await?;
proxy.await?
}
} }

View File

@@ -10,8 +10,12 @@ pub struct ProxyConfig {
/// main entrypoint for users to connect to /// main entrypoint for users to connect to
pub proxy_address: SocketAddr, pub proxy_address: SocketAddr,
/// http management endpoint. Upon user account creation control plane /// internally used for status and prometheus metrics
pub http_address: SocketAddr,
/// management endpoint. Upon user account creation control plane
/// will notify us here, so that we can 'unfreeze' user session. /// will notify us here, so that we can 'unfreeze' user session.
/// TODO It uses postgres protocol over TCP but should be migrated to http.
pub mgmt_address: SocketAddr, pub mgmt_address: SocketAddr,
/// send unauthenticated users to this URI /// send unauthenticated users to this URI

166
proxy/src/stream.rs Normal file
View File

@@ -0,0 +1,166 @@
use bytes::BytesMut;
use pin_project_lite::pin_project;
use rustls::ServerConfig;
use std::pin::Pin;
use std::sync::Arc;
use std::{io, task};
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, ReadBuf};
use tokio_rustls::server::TlsStream;
use zenith_utils::pq_proto::{BeMessage, FeMessage, FeStartupPacket};
pin_project! {
/// Stream wrapper which implements libpq's protocol.
/// NOTE: This object deliberately doesn't implement [`AsyncRead`]
/// or [`AsyncWrite`] to prevent subtle errors (e.g. trying
/// to pass random malformed bytes through the connection).
pub struct PqStream<S> {
#[pin]
stream: S,
buffer: BytesMut,
}
}
impl<S> PqStream<S> {
/// Construct a new libpq protocol wrapper.
pub fn new(stream: S) -> Self {
Self {
stream,
buffer: Default::default(),
}
}
/// Extract the underlying stream.
pub fn into_inner(self) -> S {
self.stream
}
/// Get a reference to the underlying stream.
pub fn get_ref(&self) -> &S {
&self.stream
}
}
impl<S: AsyncRead + Unpin> PqStream<S> {
/// Receive [`FeStartupPacket`], which is a first packet sent by a client.
pub async fn read_startup_packet(&mut self) -> anyhow::Result<FeStartupPacket> {
match FeStartupPacket::read_fut(&mut self.stream).await? {
Some(FeMessage::StartupPacket(packet)) => Ok(packet),
None => anyhow::bail!("connection is lost"),
other => anyhow::bail!("bad message type: {:?}", other),
}
}
pub async fn read_password_message(&mut self) -> anyhow::Result<bytes::Bytes> {
match FeMessage::read_fut(&mut self.stream).await? {
Some(FeMessage::PasswordMessage(msg)) => Ok(msg),
None => anyhow::bail!("connection is lost"),
other => anyhow::bail!("bad message type: {:?}", other),
}
}
}
impl<S: AsyncWrite + Unpin> PqStream<S> {
/// Write the message into an internal buffer, but don't flush the underlying stream.
pub fn write_message_noflush<'a>(&mut self, message: &BeMessage<'a>) -> io::Result<&mut Self> {
BeMessage::write(&mut self.buffer, message)?;
Ok(self)
}
/// Write the message into an internal buffer and flush it.
pub async fn write_message<'a>(&mut self, message: &BeMessage<'a>) -> io::Result<&mut Self> {
self.write_message_noflush(message)?;
self.flush().await?;
Ok(self)
}
/// Flush the output buffer into the underlying stream.
pub async fn flush(&mut self) -> io::Result<&mut Self> {
self.stream.write_all(&self.buffer).await?;
self.buffer.clear();
self.stream.flush().await?;
Ok(self)
}
}
pin_project! {
/// Wrapper for upgrading raw streams into secure streams.
/// NOTE: it should be possible to decompose this object as necessary.
#[project = StreamProj]
pub enum Stream<S> {
/// We always begin with a raw stream,
/// which may then be upgraded into a secure stream.
Raw { #[pin] raw: S },
/// We box [`TlsStream`] since it can be quite large.
Tls { #[pin] tls: Box<TlsStream<S>> },
}
}
impl<S> Stream<S> {
/// Construct a new instance from a raw stream.
pub fn from_raw(raw: S) -> Self {
Self::Raw { raw }
}
}
impl<S: AsyncRead + AsyncWrite + Unpin> Stream<S> {
/// If possible, upgrade raw stream into a secure TLS-based stream.
pub async fn upgrade(self, cfg: Arc<ServerConfig>) -> anyhow::Result<Self> {
match self {
Stream::Raw { raw } => {
let tls = Box::new(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?);
Ok(Stream::Tls { tls })
}
Stream::Tls { .. } => anyhow::bail!("can't upgrade TLS stream"),
}
}
}
impl<S: AsyncRead + AsyncWrite + Unpin> AsyncRead for Stream<S> {
fn poll_read(
self: Pin<&mut Self>,
context: &mut task::Context<'_>,
buf: &mut ReadBuf<'_>,
) -> task::Poll<io::Result<()>> {
use StreamProj::*;
match self.project() {
Raw { raw } => raw.poll_read(context, buf),
Tls { tls } => tls.poll_read(context, buf),
}
}
}
impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for Stream<S> {
fn poll_write(
self: Pin<&mut Self>,
context: &mut task::Context<'_>,
buf: &[u8],
) -> task::Poll<io::Result<usize>> {
use StreamProj::*;
match self.project() {
Raw { raw } => raw.poll_write(context, buf),
Tls { tls } => tls.poll_write(context, buf),
}
}
fn poll_flush(
self: Pin<&mut Self>,
context: &mut task::Context<'_>,
) -> task::Poll<io::Result<()>> {
use StreamProj::*;
match self.project() {
Raw { raw } => raw.poll_flush(context),
Tls { tls } => tls.poll_flush(context),
}
}
fn poll_shutdown(
self: Pin<&mut Self>,
context: &mut task::Context<'_>,
) -> task::Poll<io::Result<()>> {
use StreamProj::*;
match self.project() {
Raw { raw } => raw.poll_shutdown(context),
Tls { tls } => tls.poll_shutdown(context),
}
}
}
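A minimal usage sketch (assuming PqStream, Stream and the pq_proto message types above are in scope) of the intended flow, mirroring handshake() in proxy.rs: wrap a raw transport, answer an SslRequest, and upgrade the stream in place:

async fn accept_ssl<S>(
    raw: S,
    tls: std::sync::Arc<rustls::ServerConfig>,
) -> anyhow::Result<PqStream<Stream<S>>>
where
    S: AsyncRead + AsyncWrite + Unpin,
{
    let mut stream = PqStream::new(Stream::from_raw(raw));
    if let FeStartupPacket::SslRequest = stream.read_startup_packet().await? {
        // Tell the client we support TLS, then upgrade the raw stream.
        stream.write_message(&BeMessage::EncryptionResponse(true)).await?;
        stream = PqStream::new(stream.into_inner().upgrade(tls).await?);
    }
    // (Any other first packet is simply ignored in this sketch.)
    Ok(stream)
}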

View File

@@ -1,4 +1,4 @@
use anyhow::{anyhow, Context}; use anyhow::Context;
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::{mpsc, Mutex}; use std::sync::{mpsc, Mutex};
@@ -34,7 +34,7 @@ impl<T> Waiters<T> {
.lock() .lock()
.unwrap() .unwrap()
.remove(key) .remove(key)
.ok_or_else(|| anyhow!("key {} not found", key))?; .with_context(|| format!("key {} not found", key))?;
tx.send(value).context("channel hangup") tx.send(value).context("channel hangup")
} }
} }

32
pyproject.toml Normal file
View File

@@ -0,0 +1,32 @@
[tool.poetry]
name = "zenith"
version = "0.1.0"
description = ""
authors = ["Dmitry Rodionov <dmitry@zenith.tech>"]
[tool.poetry.dependencies]
python = "^3.7"
pytest = "^6.2.5"
psycopg2-binary = "^2.9.1"
typing-extensions = "^3.10.0"
PyJWT = {version = "^2.1.0", extras = ["crypto"]}
requests = "^2.26.0"
pytest-xdist = "^2.3.0"
asyncpg = "^0.24.0"
aiopg = "^1.3.1"
cached-property = "^1.5.2"
Jinja2 = "^3.0.2"
types-requests = "^2.27.7"
types-psycopg2 = "^2.9.6"
boto3 = "^1.20.40"
boto3-stubs = "^1.20.40"
moto = {version = "^3.0.0", extras = ["server"]}
[tool.poetry.dev-dependencies]
yapf = "==0.31.0"
flake8 = "^3.9.2"
mypy = "==0.910"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

View File

@@ -3,6 +3,8 @@ addopts =
-m 'not remote_cluster' -m 'not remote_cluster'
markers = markers =
remote_cluster remote_cluster
testpaths =
test_runner
minversion = 6.0 minversion = 6.0
log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s
log_date_format = %Y-%m-%d %H:%M:%S log_date_format = %Y-%m-%d %H:%M:%S

View File

@@ -14,17 +14,30 @@ from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from textwrap import dedent from textwrap import dedent
from typing import Any, Iterable, List, Optional from typing import Any, Dict, Iterator, Iterable, List, Optional
import argparse import argparse
import hashlib
import json import json
import os import os
import shutil import shutil
import socket
import subprocess import subprocess
import sys import sys
def intersperse(sep: Any, iterable: Iterable[Any]): def file_mtime_or_zero(path: Path) -> int:
try:
return path.stat().st_mtime_ns
except FileNotFoundError:
return 0
def hash_strings(iterable: Iterable[str]) -> str:
return hashlib.sha1(''.join(iterable).encode('utf-8')).hexdigest()
def intersperse(sep: Any, iterable: Iterable[Any]) -> Iterator[Any]:
fst = True fst = True
for item in iterable: for item in iterable:
if not fst: if not fst:
@@ -33,18 +46,18 @@ def intersperse(sep: Any, iterable: Iterable[Any]):
yield item yield item
def find_demangler(demangler=None): def find_demangler(demangler: Optional[Path] = None) -> Path:
known_tools = ['c++filt', 'rustfilt', 'llvm-cxxfilt'] known_tools = ['c++filt', 'rustfilt', 'llvm-cxxfilt']
if demangler: if demangler:
# Explicit argument has precedence over `known_tools` # Explicit argument has precedence over `known_tools`
demanglers = [demangler] demanglers = [demangler]
else: else:
demanglers = known_tools demanglers = [Path(x) for x in known_tools]
for demangler in demanglers: for exe in demanglers:
if shutil.which(demangler): if shutil.which(exe):
return demangler return exe
raise Exception(' '.join([ raise Exception(' '.join([
'Failed to find symbol demangler.', 'Failed to find symbol demangler.',
@@ -54,13 +67,13 @@ def find_demangler(demangler=None):
class Cargo: class Cargo:
def __init__(self, cwd: Path): def __init__(self, cwd: Path) -> None:
self.cwd = cwd self.cwd = cwd
self.target_dir = Path(os.environ.get('CARGO_TARGET_DIR', cwd / 'target')).resolve() self.target_dir = Path(os.environ.get('CARGO_TARGET_DIR', cwd / 'target')).resolve()
self._rustlib_dir = None self._rustlib_dir: Optional[Path] = None
@property @property
def rustlib_dir(self): def rustlib_dir(self) -> Path:
if not self._rustlib_dir: if not self._rustlib_dir:
cmd = [ cmd = [
'cargo', 'cargo',
@@ -131,44 +144,26 @@ class LLVM:
return name return name
def profdata(self, input_dir: Path, output_profdata: Path): def profdata(self, input_files_list: Path, output_profdata: Path) -> None:
profraws = [f for f in input_dir.iterdir() if f.suffix == '.profraw'] subprocess.check_call([
if not profraws: self.resolve_tool('llvm-profdata'),
raise Exception(f'No profraw files found at {input_dir}') 'merge',
'-sparse',
with open(input_dir / 'profraw.list', 'w') as input_files: f'-input-files={input_files_list}',
profraw_mtime = 0 f'-output={output_profdata}',
for profraw in profraws: ])
profraw_mtime = max(profraw_mtime, profraw.stat().st_mtime_ns)
print(profraw, file=input_files)
input_files.flush()
try:
profdata_mtime = output_profdata.stat().st_mtime_ns
except FileNotFoundError:
profdata_mtime = 0
# An obvious make-ish optimization
if profraw_mtime >= profdata_mtime:
subprocess.check_call([
self.resolve_tool('llvm-profdata'),
'merge',
'-sparse',
f'-input-files={input_files.name}',
f'-output={output_profdata}',
])
def _cov(self, def _cov(self,
*extras, *args,
subcommand: str, subcommand: str,
profdata: Path, profdata: Path,
objects: List[str], objects: List[str],
sources: List[str], sources: List[str],
demangler: Optional[str] = None) -> None: demangler: Optional[Path] = None) -> None:
cwd = self.cargo.cwd cwd = self.cargo.cwd
objects = list(intersperse('-object', objects)) objects = list(intersperse('-object', objects))
extras = list(extras) extras = list(args)
# For some reason `rustc` produces relative paths to src files, # For some reason `rustc` produces relative paths to src files,
# so we force it to cut the $PWD prefix. # so we force it to cut the $PWD prefix.
@@ -194,7 +189,7 @@ class LLVM:
self._cov(subcommand='report', **kwargs) self._cov(subcommand='report', **kwargs)
def cov_export(self, *, kind: str, **kwargs) -> None: def cov_export(self, *, kind: str, **kwargs) -> None:
extras = [f'-format={kind}'] extras = (f'-format={kind}', )
self._cov(subcommand='export', *extras, **kwargs) self._cov(subcommand='export', *extras, **kwargs)
def cov_show(self, *, kind: str, output_dir: Optional[Path] = None, **kwargs) -> None: def cov_show(self, *, kind: str, output_dir: Optional[Path] = None, **kwargs) -> None:
@@ -206,42 +201,93 @@ class LLVM:
@dataclass @dataclass
class Report(ABC): class ProfDir:
cwd: Path
llvm: LLVM
def __post_init__(self) -> None:
self.cwd.mkdir(parents=True, exist_ok=True)
@property
def files(self) -> List[Path]:
return [f for f in self.cwd.iterdir() if f.suffix in ('.profraw', '.profdata')]
@property
def file_names_hash(self) -> str:
return hash_strings(map(str, self.files))
def merge(self, output_profdata: Path) -> bool:
files = self.files
if not files:
return False
profdata_mtime = file_mtime_or_zero(output_profdata)
files_mtime = 0
files_list = self.cwd / 'files.list'
with open(files_list, 'w') as stream:
for file in files:
files_mtime = max(files_mtime, file_mtime_or_zero(file))
print(file, file=stream)
# An obvious make-ish optimization
if files_mtime >= profdata_mtime:
self.llvm.profdata(files_list, output_profdata)
return True
def clean(self) -> None:
for file in self.cwd.iterdir():
os.remove(file)
def __truediv__(self, other):
return self.cwd / other
def __str__(self):
return str(self.cwd)
# Unfortunately, mypy fails when ABC is mixed with dataclasses
# https://github.com/python/mypy/issues/5374#issuecomment-568335302
@dataclass
class ReportData:
""" Common properties of a coverage report """ """ Common properties of a coverage report """
llvm: LLVM llvm: LLVM
demangler: str demangler: Path
profdata: Path profdata: Path
objects: List[str] objects: List[str]
sources: List[str] sources: List[str]
def _common_kwargs(self):
class Report(ABC, ReportData):
def _common_kwargs(self) -> Dict[str, Any]:
return dict(profdata=self.profdata, return dict(profdata=self.profdata,
objects=self.objects, objects=self.objects,
sources=self.sources, sources=self.sources,
demangler=self.demangler) demangler=self.demangler)
@abstractmethod @abstractmethod
def generate(self): def generate(self) -> None:
pass pass
def open(self): def open(self) -> None:
# Do nothing by default # Do nothing by default
pass pass
class SummaryReport(Report): class SummaryReport(Report):
def generate(self): def generate(self) -> None:
self.llvm.cov_report(**self._common_kwargs()) self.llvm.cov_report(**self._common_kwargs())
class TextReport(Report): class TextReport(Report):
def generate(self): def generate(self) -> None:
self.llvm.cov_show(kind='text', **self._common_kwargs()) self.llvm.cov_show(kind='text', **self._common_kwargs())
class LcovReport(Report): class LcovReport(Report):
def generate(self): def generate(self) -> None:
self.llvm.cov_export(kind='lcov', **self._common_kwargs()) self.llvm.cov_export(kind='lcov', **self._common_kwargs())
@@ -249,11 +295,11 @@ class LcovReport(Report):
class HtmlReport(Report): class HtmlReport(Report):
output_dir: Path output_dir: Path
def generate(self): def generate(self) -> None:
self.llvm.cov_show(kind='html', output_dir=self.output_dir, **self._common_kwargs()) self.llvm.cov_show(kind='html', output_dir=self.output_dir, **self._common_kwargs())
print(f'HTML report is located at `{self.output_dir}`') print(f'HTML report is located at `{self.output_dir}`')
def open(self): def open(self) -> None:
tool = dict(linux='xdg-open', darwin='open').get(sys.platform) tool = dict(linux='xdg-open', darwin='open').get(sys.platform)
if not tool: if not tool:
raise Exception(f'Unknown platform {sys.platform}') raise Exception(f'Unknown platform {sys.platform}')
@@ -266,9 +312,9 @@ class HtmlReport(Report):
@dataclass @dataclass
class GithubPagesReport(HtmlReport): class GithubPagesReport(HtmlReport):
output_dir: Path output_dir: Path
commit_url: str commit_url: str = 'https://local/deadbeef'
def generate(self): def generate(self) -> None:
def index_path(path): def index_path(path):
return path / 'index.html' return path / 'index.html'
@@ -322,9 +368,9 @@ class GithubPagesReport(HtmlReport):
class State: class State:
def __init__(self, cwd: Path, top_dir: Optional[Path], profraw_prefix: Optional[str]): def __init__(self, cwd: Path, top_dir: Optional[Path], profraw_prefix: Optional[str]) -> None:
# Use hostname by default # Use hostname by default
profraw_prefix = profraw_prefix or '%h' self.profraw_prefix = profraw_prefix or socket.gethostname()
self.cwd = cwd self.cwd = cwd
self.cargo = Cargo(self.cwd) self.cargo = Cargo(self.cwd)
@@ -334,16 +380,18 @@ class State:
self.report_dir = self.top_dir / 'report' self.report_dir = self.top_dir / 'report'
# Directory for raw coverage data emitted by executables # Directory for raw coverage data emitted by executables
self.profraw_dir = self.top_dir / 'profraw' self.profraw_dir = ProfDir(llvm=self.llvm, cwd=self.top_dir / 'profraw')
self.profraw_dir.mkdir(parents=True, exist_ok=True)
# Directory for processed coverage data
self.profdata_dir = ProfDir(llvm=self.llvm, cwd=self.top_dir / 'profdata')
# Aggregated coverage data # Aggregated coverage data
self.profdata_file = self.top_dir / 'coverage.profdata' self.final_profdata = self.top_dir / 'coverage.profdata'
# Dump all coverage data files into a dedicated directory. # Dump all coverage data files into a dedicated directory.
# Each filename is parameterized by PID & executable's signature. # Each filename is parameterized by PID & executable's signature.
os.environ['LLVM_PROFILE_FILE'] = str(self.profraw_dir / os.environ['LLVM_PROFILE_FILE'] = str(self.profraw_dir /
f'cov-{profraw_prefix}-%p-%m.profraw') f'{self.profraw_prefix}-%p-%m.profraw')
os.environ['RUSTFLAGS'] = ' '.join([ os.environ['RUSTFLAGS'] = ' '.join([
os.environ.get('RUSTFLAGS', ''), os.environ.get('RUSTFLAGS', ''),
@@ -367,13 +415,41 @@ class State:
# see: https://github.com/rust-lang/rust/pull/90132 # see: https://github.com/rust-lang/rust/pull/90132
os.environ['RUSTC_BOOTSTRAP'] = '1' os.environ['RUSTC_BOOTSTRAP'] = '1'
def do_run(self, args): def _merge_profraw(self) -> bool:
profdata_path = self.profdata_dir / '-'.join([
self.profraw_prefix,
f'{self.profdata_dir.file_names_hash}.profdata',
])
print(f'* Merging profraw files (into {profdata_path.name})')
did_merge_profraw = self.profraw_dir.merge(profdata_path)
# We no longer need those profraws
self.profraw_dir.clean()
return did_merge_profraw
def _merge_profdata(self) -> bool:
self._merge_profraw()
print(f'* Merging profdata files (into {self.final_profdata.name})')
return self.profdata_dir.merge(self.final_profdata)
def do_run(self, args) -> None:
subprocess.check_call([*args.command, *args.args]) subprocess.check_call([*args.command, *args.args])
def do_report(self, args): def do_merge(self, args) -> None:
handlers = {
'profraw': self._merge_profraw,
'profdata': self._merge_profdata,
}
handlers[args.kind]()
def do_report(self, args) -> None:
if args.all and args.sources: if args.all and args.sources:
raise Exception('--all should not be used with sources') raise Exception('--all should not be used with sources')
if args.format == 'github' and not args.commit_url:
raise Exception('--format=github should be used with --commit-url')
# see man for `llvm-cov show [sources]` # see man for `llvm-cov show [sources]`
if args.all: if args.all:
sources = [] sources = []
@@ -382,8 +458,8 @@ class State:
else: else:
sources = args.sources sources = args.sources
print('* Merging profraw files') if not self._merge_profdata():
self.llvm.profdata(self.profraw_dir, self.profdata_file) raise Exception(f'No coverage data files found at {self.top_dir}')
objects = [] objects = []
if args.input_objects: if args.input_objects:
@@ -395,12 +471,11 @@ class State:
print('* Collecting object files using cargo') print('* Collecting object files using cargo')
objects.extend(self.cargo.binaries(args.profile)) objects.extend(self.cargo.binaries(args.profile))
params = dict(llvm=self.llvm, params: Dict[str, Any] = dict(llvm=self.llvm,
demangler=find_demangler(args.demangler), demangler=find_demangler(args.demangler),
profdata=self.profdata_file, profdata=self.final_profdata,
objects=objects, objects=objects,
sources=sources) sources=sources)
formats = { formats = {
'html': 'html':
lambda: HtmlReport(**params, output_dir=self.report_dir), lambda: HtmlReport(**params, output_dir=self.report_dir),
@@ -414,10 +489,7 @@ class State:
lambda: GithubPagesReport( lambda: GithubPagesReport(
**params, output_dir=self.report_dir, commit_url=args.commit_url), **params, output_dir=self.report_dir, commit_url=args.commit_url),
} }
report = formats[args.format]()
report = formats.get(args.format)()
if not report:
raise Exception('Format `{args.format}` is not supported')
print(f'* Rendering coverage report ({args.format})') print(f'* Rendering coverage report ({args.format})')
report.generate() report.generate()
@@ -426,7 +498,7 @@ class State:
print('* Opening the report') print('* Opening the report')
report.open() report.open()
def do_clean(self, args): def do_clean(self, args: Any) -> None:
# Wipe everything if no filters have been provided # Wipe everything if no filters have been provided
if not (args.report or args.prof): if not (args.report or args.prof):
shutil.rmtree(self.top_dir, ignore_errors=True) shutil.rmtree(self.top_dir, ignore_errors=True)
@@ -434,10 +506,12 @@ class State:
if args.report: if args.report:
shutil.rmtree(self.report_dir, ignore_errors=True) shutil.rmtree(self.report_dir, ignore_errors=True)
if args.prof: if args.prof:
self.profdata_file.unlink(missing_ok=True) self.profraw_dir.clean()
self.profdata_dir.clean()
self.final_profdata.unlink(missing_ok=True)
def main(): def main() -> None:
app = sys.argv[0] app = sys.argv[0]
example = f""" example = f"""
prerequisites: prerequisites:
@@ -446,7 +520,7 @@ prerequisites:
self-contained example: self-contained example:
{app} run make {app} run make
{app} run pipenv run pytest test_runner {app} run poetry run pytest test_runner
{app} run cargo test {app} run cargo test
{app} report --open {app} report --open
""" """
@@ -463,6 +537,12 @@ self-contained example:
p_run.add_argument('command', nargs=1) p_run.add_argument('command', nargs=1)
p_run.add_argument('args', nargs=argparse.REMAINDER) p_run.add_argument('args', nargs=argparse.REMAINDER)
p_merge = commands.add_parser('merge', help='save disk space by merging cov files')
p_merge.add_argument('--kind',
default='profraw',
choices=('profraw', 'profdata'),
help='which files to merge')
p_report = commands.add_parser('report', help='generate a coverage report') p_report = commands.add_parser('report', help='generate a coverage report')
p_report.add_argument('--profile', p_report.add_argument('--profile',
default='debug', default='debug',
@@ -480,7 +560,10 @@ self-contained example:
default='auto', default='auto',
choices=('auto', 'true', 'false'), choices=('auto', 'true', 'false'),
help='use cargo for auto discovery of binaries') help='use cargo for auto discovery of binaries')
p_report.add_argument('--commit-url', type=str, help='required for --format=github') p_report.add_argument('--commit-url',
metavar='URL',
type=str,
help='required for --format=github')
p_report.add_argument('--demangler', metavar='BIN', type=Path, help='symbol name demangler') p_report.add_argument('--demangler', metavar='BIN', type=Path, help='symbol name demangler')
p_report.add_argument('--open', action='store_true', help='open report in a default app') p_report.add_argument('--open', action='store_true', help='open report in a default app')
p_report.add_argument('--all', action='store_true', help='show everything, e.g. deps') p_report.add_argument('--all', action='store_true', help='show everything, e.g. deps')
@@ -493,15 +576,16 @@ self-contained example:
args = parser.parse_args() args = parser.parse_args()
state = State(cwd=Path.cwd(), top_dir=args.dir, profraw_prefix=args.profraw_prefix) state = State(cwd=Path.cwd(), top_dir=args.dir, profraw_prefix=args.profraw_prefix)
commands = { handlers = {
'run': state.do_run, 'run': state.do_run,
'merge': state.do_merge,
'report': state.do_report, 'report': state.do_report,
'clean': state.do_clean, 'clean': state.do_clean,
} }
action = commands.get(args.subparser_name) handler = handlers.get(args.subparser_name)
if action: if handler:
action(args) handler(args)
else: else:
parser.print_help() parser.print_help()
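
The hunks above replace the inline `llvm-profdata` call with a `_merge_profdata` helper, add a `merge` subcommand, and point report generation at `final_profdata`. As a rough, hypothetical sketch of what the merging step boils down to (the helper name and directory layout are assumptions; the real script routes this through its `State` and `llvm` wrappers):

```python
# Hypothetical sketch of a profraw merge step, assuming `llvm-profdata` is on PATH.
import subprocess
from pathlib import Path
from typing import Optional


def merge_profraw(profraw_dir: Path, out_file: Path) -> Optional[Path]:
    """Merge all *.profraw files in profraw_dir into a single .profdata file."""
    profraw_files = sorted(profraw_dir.glob('*.profraw'))
    if not profraw_files:
        return None  # caller raises "No coverage data files found ..."
    cmd = ['llvm-profdata', 'merge', '-sparse', '-o', str(out_file)]
    cmd.extend(str(p) for p in profraw_files)
    subprocess.run(cmd, check=True)
    return out_file
```

Merging ahead of report generation is what the new `merge` subcommand exposes; per its help text, the point is to save disk space by collapsing many raw coverage files into one.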


@@ -14,7 +14,7 @@ mkdir -p data/$REPORT_TO
cp $REPORT_FROM/* data/$REPORT_TO cp $REPORT_FROM/* data/$REPORT_TO
echo "Generating report" echo "Generating report"
pipenv run python $SCRIPT_DIR/generate_perf_report_page.py --input-dir data/$REPORT_TO --out reports/$REPORT_TO.html poetry run python $SCRIPT_DIR/generate_perf_report_page.py --input-dir data/$REPORT_TO --out reports/$REPORT_TO.html
echo "Uploading perf result" echo "Uploading perf result"
git add data reports git add data reports
git \ git \

scripts/pysync Executable file

@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# This is a helper script for setting up/updating our python environment.
# It is intended to be the primary entry point for people who want to
# set up the test environment without going into the details of python package management
poetry install --no-root # this installs dev dependencies by default

scripts/pytest Executable file

@@ -0,0 +1,9 @@
#!/usr/bin/env bash
# This is a helper script to run pytest without going too deep
# into python dependency management details.
# It may be desirable to create a more sophisticated pytest launcher
# with commonly used options to simplify launching from e.g. CI
poetry run pytest "${@:1}"


@@ -22,23 +22,24 @@ runtime. Currently, there are only two batches:
### Running the tests ### Running the tests
Because pytest will search all subdirectories for tests, it's easiest to There is a wrapper script to invoke pytest: `./scripts/pytest`.
run the tests from within the `test_runner` directory. It accepts all the arguments that are accepted by pytest.
Depending on your installation, pytest can also be invoked directly.
Test state (postgres data, pageserver state, and log files) will Test state (postgres data, pageserver state, and log files) will
be stored under a directory `test_output`. be stored under a directory `test_output`.
You can run all the tests with: You can run all the tests with:
`pipenv run pytest` `./scripts/pytest`
If you want to run all the tests in a particular file: If you want to run all the tests in a particular file:
`pipenv run pytest test_pgbench.py` `./scripts/pytest test_pgbench.py`
If you want to run all tests that have the string "bench" in their names: If you want to run all tests that have the string "bench" in their names:
`pipenv run pytest -k bench` `./scripts/pytest -k bench`
Useful environment variables: Useful environment variables:
@@ -47,14 +48,18 @@ Useful environment variables:
`TEST_OUTPUT`: Set the directory where test state and test output files `TEST_OUTPUT`: Set the directory where test state and test output files
should go. should go.
`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
`ZENITH_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as
`--pageserver-config-override=${value}` parameter values when zenith cli is invoked
`FORCE_MOCK_S3`: inits every test's pageserver with a mock S3 used as a remote storage.
`RUST_LOG`: logging configuration to pass into Zenith CLI
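
To make `ZENITH_PAGESERVER_OVERRIDES` concrete: each `;`-separated entry becomes its own `--pageserver-config-override` flag, which is what the fixture helper further down in this diff does. A simplified sketch of that translation:

```python
import os
from typing import List


def pageserver_override_params() -> List[str]:
    # Each ';'-separated entry becomes one --pageserver-config-override flag.
    env_overrides = os.getenv('ZENITH_PAGESERVER_OVERRIDES')
    if env_overrides is None:
        return []
    return [f'--pageserver-config-override={o.strip()}' for o in env_overrides.split(';')]
```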
Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them: Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them:
`pytest -s --log-cli-level=INFO ...` `./scripts/pytest -s --log-cli-level=INFO ...`
(Note many tests capture subprocess outputs separately, so this may not (Note many tests capture subprocess outputs separately, so this may not
show much.) show much.)
Exit after the first test failure: Exit after the first test failure:
`pytest -x ...` `./scripts/pytest -x ...`
(there are many more pytest options; run `pytest -h` to see them.) (there are many more pytest options; run `pytest -h` to see them.)
### Writing a test ### Writing a test


@@ -5,7 +5,7 @@ import psycopg2.extras
import pytest import pytest
from fixtures.log_helper import log from fixtures.log_helper import log
from fixtures.utils import print_gc_result from fixtures.utils import print_gc_result
from fixtures.zenith_fixtures import ZenithEnv from fixtures.zenith_fixtures import ZenithEnvBuilder
pytest_plugins = ("fixtures.zenith_fixtures") pytest_plugins = ("fixtures.zenith_fixtures")
@@ -13,10 +13,18 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# #
# Create a couple of branches off the main branch, at a historical point in time. # Create a couple of branches off the main branch, at a historical point in time.
# #
def test_branch_behind(zenith_simple_env: ZenithEnv): def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
env = zenith_simple_env
# Use a safekeeper in this test to avoid a subtle race condition.
# Without a safekeeper, walreceiver reconnection can get stuck
# because of an IO deadlock.
#
# See https://github.com/zenithdb/zenith/issues/1068
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init()
# Branch at the point where only 100 rows were inserted # Branch at the point where only 100 rows were inserted
env.zenith_cli(["branch", "test_branch_behind", "empty"]) env.zenith_cli(["branch", "test_branch_behind", "main"])
pgmain = env.postgres.create_start('test_branch_behind') pgmain = env.postgres.create_start('test_branch_behind')
log.info("postgres is running on 'test_branch_behind' branch") log.info("postgres is running on 'test_branch_behind' branch")
@@ -101,6 +109,10 @@ def test_branch_behind(zenith_simple_env: ZenithEnv):
# branch at pre-initdb lsn # branch at pre-initdb lsn
with pytest.raises(Exception, match="invalid branch start lsn"): with pytest.raises(Exception, match="invalid branch start lsn"):
env.zenith_cli(["branch", "test_branch_preinitdb", "main@0/42"])
# branch at pre-ancestor lsn
with pytest.raises(Exception, match="less than timeline ancestor lsn"):
env.zenith_cli(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"]) env.zenith_cli(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
# check that we cannot create branch based on garbage collected data # check that we cannot create branch based on garbage collected data


@@ -0,0 +1,84 @@
from contextlib import closing
import asyncio
import asyncpg
import random
from fixtures.zenith_fixtures import ZenithEnv, Postgres, Safekeeper
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
# Test configuration
#
# Create a table with {num_rows} rows, and perform {updates_to_perform} random
# UPDATEs on it, using {num_connections} separate connections.
num_connections = 10
num_rows = 100000
updates_to_perform = 10000
updates_performed = 0
# Run random UPDATEs on test table
async def update_table(pg: Postgres):
global updates_performed
pg_conn = await pg.connect_async()
while updates_performed < updates_to_perform:
updates_performed += 1
id = random.randrange(1, num_rows)
row = await pg_conn.fetchrow(f'UPDATE foo SET counter = counter + 1 WHERE id = {id}')
# Perform aggressive GC with 0 horizon
async def gc(env: ZenithEnv, timeline: str):
psconn = await env.pageserver.connect_async()
while updates_performed < updates_to_perform:
await psconn.execute(f"do_gc {env.initial_tenant} {timeline} 0")
# At the same time, run UPDATEs and GC
async def update_and_gc(env: ZenithEnv, pg: Postgres, timeline: str):
workers = []
for worker_id in range(num_connections):
workers.append(asyncio.create_task(update_table(pg)))
workers.append(asyncio.create_task(gc(env, timeline)))
# await all workers
await asyncio.gather(*workers)
#
# Aggressively force GC, while running queries.
#
# (repro for https://github.com/zenithdb/zenith/issues/1047)
#
def test_gc_aggressive(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Create a branch for us
env.zenith_cli(["branch", "test_gc_aggressive", "empty"])
pg = env.postgres.create_start('test_gc_aggressive')
log.info('postgres is running on test_gc_aggressive branch')
conn = pg.connect()
cur = conn.cursor()
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
# Create table, and insert the first 100 rows
cur.execute('CREATE TABLE foo (id int, counter int, t text)')
cur.execute(f'''
INSERT INTO foo
SELECT g, 0, 'long string to consume some space' || g
FROM generate_series(1, {num_rows}) g
''')
cur.execute('CREATE INDEX ON foo(id)')
asyncio.run(update_and_gc(env, pg, timeline))
row = cur.execute('SELECT COUNT(*), SUM(counter) FROM foo')
assert cur.fetchone() == (num_rows, updates_to_perform)


@@ -0,0 +1,61 @@
import pytest
import random
import time
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
# Test restarting page server, while safekeeper and compute node keep
# running.
def test_next_xid(zenith_env_builder: ZenithEnvBuilder):
# One safekeeper is enough for this test.
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init()
pg = env.postgres.create_start('main')
conn = pg.connect()
cur = conn.cursor()
cur.execute('CREATE TABLE t(x integer)')
iterations = 32
for i in range(1, iterations + 1):
print(f'iteration {i} / {iterations}')
# Kill and restart the pageserver.
pg.stop()
env.pageserver.stop(immediate=True)
env.pageserver.start()
pg.start()
retry_sleep = 0.5
max_retries = 200
retries = 0
while True:
try:
conn = pg.connect()
cur = conn.cursor()
cur.execute(f"INSERT INTO t values({i})")
conn.close()
except Exception as error:
# It's normal that it takes some time for the pageserver to
# restart, and for the connection to fail until it does. It
# should eventually recover, so retry until it succeeds.
print(f'failed: {error}')
if retries < max_retries:
retries += 1
print(f'retry {retries} / {max_retries}')
time.sleep(retry_sleep)
continue
else:
raise
break
conn = pg.connect()
cur = conn.cursor()
cur.execute("SELECT count(*) FROM t")
assert cur.fetchone() == (iterations, )


@@ -1,5 +1,5 @@
import json import json
from uuid import uuid4 from uuid import uuid4, UUID
import pytest import pytest
import psycopg2 import psycopg2
import requests import requests
@@ -96,6 +96,15 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
client.tenant_create(tenant_id) client.tenant_create(tenant_id)
assert tenant_id.hex in {t['id'] for t in client.tenant_list()} assert tenant_id.hex in {t['id'] for t in client.tenant_list()}
# check its timelines
timelines = client.timeline_list(tenant_id)
assert len(timelines) > 0
for timeline_id_str in timelines:
timeline_details = client.timeline_detail(tenant_id, UUID(timeline_id_str))
assert timeline_details['type'] == 'Local'
assert timeline_details['tenant_id'] == tenant_id.hex
assert timeline_details['timeline_id'] == timeline_id_str
# create branch # create branch
branch_name = uuid4().hex branch_name = uuid4().hex
client.branch_create(tenant_id, branch_name, "main") client.branch_create(tenant_id, branch_name, "main")


@@ -0,0 +1,103 @@
# It's possible to run any regular test with the local fs remote storage via
# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/zenith_zzz/'}" poetry ......
import time, shutil, os
from contextlib import closing
from pathlib import Path
from uuid import UUID
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.log_helper import log
import pytest
pytest_plugins = ("fixtures.zenith_fixtures")
#
# Tests that a piece of data is backed up and restored correctly:
#
# 1. Initial pageserver
# * starts a pageserver with remote storage, stores specific data in its tables
# * triggers a checkpoint (which produces a local data scheduled for backup), gets the corresponding timeline id
# * polls the timeline status to ensure it's copied remotely
# * stops the pageserver, clears all local directories
#
# 2. Second pageserver
# * starts another pageserver, connected to the same remote storage
# * same timeline id is queried for status, triggering timeline's download
# * timeline status is polled until it's downloaded
# * queries the specific data, ensuring that it matches the one stored before
#
# The tests are done for all types of remote storage pageserver supports.
@pytest.mark.skip(reason="will be fixed with https://github.com/zenithdb/zenith/issues/1193")
@pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3'])
def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, storage_type: str):
zenith_env_builder.rust_log_override = 'debug'
zenith_env_builder.num_safekeepers = 1
if storage_type == 'local_fs':
zenith_env_builder.enable_local_fs_remote_storage()
elif storage_type == 'mock_s3':
zenith_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore')
else:
raise RuntimeError(f'Unknown storage type: {storage_type}')
data_id = 1
data_secret = 'very secret secret'
##### First start, insert secret data and upload it to the remote storage
env = zenith_env_builder.init()
pg = env.postgres.create_start()
tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0]
timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0]
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute(f'''
CREATE TABLE t1(id int primary key, secret text);
INSERT INTO t1 VALUES ({data_id}, '{data_secret}');
''')
# run checkpoint manually to be sure that data landed in remote storage
with closing(env.pageserver.connect()) as psconn:
with psconn.cursor() as pscur:
pscur.execute(f"do_gc {tenant_id} {timeline_id}")
log.info("waiting for upload") # TODO api to check if upload is done
time.sleep(2)
##### Stop the first pageserver instance, erase all its data
env.postgres.stop_all()
env.pageserver.stop()
dir_to_clear = Path(env.repo_dir) / 'tenants'
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
##### Second start, restore the data and ensure it's the same
env.pageserver.start()
client = env.pageserver.http_client()
client.timeline_attach(UUID(tenant_id), UUID(timeline_id))
# FIXME cannot handle duplicate download requests (which might be caused by repeated timeline detail calls)
# subject to fix in https://github.com/zenithdb/zenith/issues/997
time.sleep(5)
log.info("waiting for timeline redownload")
attempts = 0
while True:
timeline_details = client.timeline_detail(UUID(tenant_id), UUID(timeline_id))
assert timeline_details['timeline_id'] == timeline_id
assert timeline_details['tenant_id'] == tenant_id
if timeline_details['type'] == 'Local':
log.info("timeline downloaded, checking its data")
break
attempts += 1
if attempts > 10:
raise Exception("timeline redownload failed")
log.debug("still waiting")
time.sleep(1)
pg = env.postgres.create_start()
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute(f'SELECT secret FROM t1 WHERE id = {data_id};')
assert cur.fetchone() == (data_secret, )


@@ -0,0 +1,267 @@
from contextlib import closing, contextmanager
import os
import pathlib
import subprocess
import threading
from uuid import UUID
from fixtures.log_helper import log
import time
import signal
import pytest
from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, ZenithPageserverHttpClient, zenith_binpath, pg_distrib_dir
def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
assert abs(a - b) / a < margin_ratio, (a, b, margin_ratio)
@contextmanager
def new_pageserver_helper(new_pageserver_dir: pathlib.Path,
pageserver_bin: pathlib.Path,
remote_storage_mock_path: pathlib.Path,
pg_port: int,
http_port: int):
"""
cannot use ZenithPageserver yet because it depends on zenith cli
which currently lacks support for multiple pageservers
"""
cmd = [
str(pageserver_bin),
'--init',
'--workdir',
str(new_pageserver_dir),
f"-c listen_pg_addr='localhost:{pg_port}'",
f"-c listen_http_addr='localhost:{http_port}'",
f"-c pg_distrib_dir='{pg_distrib_dir}'",
f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}",
]
subprocess.check_output(cmd, text=True)
# actually run new pageserver
cmd = [
str(pageserver_bin),
'--workdir',
str(new_pageserver_dir),
'--daemonize',
]
log.info("starting new pageserver %s", cmd)
out = subprocess.check_output(cmd, text=True)
log.info("started new pageserver %s", out)
try:
yield
finally:
log.info("stopping new pageserver")
pid = int((new_pageserver_dir / 'pageserver.pid').read_text())
os.kill(pid, signal.SIGQUIT)
def wait_for(number_of_iterations: int, interval: int, func):
last_exception = None
for i in range(number_of_iterations):
try:
res = func()
except Exception as e:
log.info("waiting for %s iteration %s failed", func, i + 1)
last_exception = e
time.sleep(interval)
continue
return res
raise Exception("timed out while waiting for %s" % func) from last_exception
@contextmanager
def pg_cur(pg):
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
yield cur
def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Event):
log.info("load started")
inserted_ctr = 0
failed = False
while not stop_event.is_set():
try:
with pg_cur(pg) as cur:
cur.execute("INSERT INTO load VALUES ('some payload')")
inserted_ctr += 1
except:
if not failed:
log.info("load failed")
failed = True
load_ok_event.clear()
else:
if failed:
with pg_cur(pg) as cur:
# if we recovered after failure verify that we have correct number of rows
log.info("recovering at %s", inserted_ctr)
cur.execute("SELECT count(*) FROM load")
# it seems that sometimes the transaction gets committed before we can acknowledge
# the result, so sometimes the selected value is larger by one than we expect
assert cur.fetchone()[0] - inserted_ctr <= 1
log.info("successfully recovered %s", inserted_ctr)
failed = False
load_ok_event.set()
log.info('load thread stopped')
def assert_local(pageserver_http_client: ZenithPageserverHttpClient, tenant: str, timeline: str):
timeline_detail = pageserver_http_client.timeline_detail(UUID(tenant), UUID(timeline))
assert timeline_detail.get('type') == "Local", timeline_detail
return timeline_detail
@pytest.mark.skip(reason="will be fixed with https://github.com/zenithdb/zenith/issues/1193")
@pytest.mark.parametrize('with_load', ['with_load', 'without_load'])
def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
port_distributor: PortDistributor,
with_load: str):
zenith_env_builder.num_safekeepers = 1
zenith_env_builder.enable_local_fs_remote_storage()
env = zenith_env_builder.init()
# create folder for remote storage mock
remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage'
tenant = env.create_tenant("74ee8b079a0e437eb0afea7d26a07209")
log.info("tenant to relocate %s", tenant)
env.zenith_cli(["branch", "test_tenant_relocation", "main", f"--tenantid={tenant}"])
tenant_pg = env.postgres.create_start(
"test_tenant_relocation",
"main", # branch name, None means same as node name
tenant_id=tenant,
)
# insert some data
with closing(tenant_pg.connect()) as conn:
with conn.cursor() as cur:
# save timeline for later gc call
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
log.info("timeline to relocate %s", timeline)
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
cur.execute("CREATE TABLE t(key int primary key, value text)")
cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'some payload'")
cur.execute("SELECT sum(key) FROM t")
assert cur.fetchone() == (500500, )
if with_load == 'with_load':
# create load table
with pg_cur(tenant_pg) as cur:
cur.execute("CREATE TABLE load(value text)")
load_stop_event = threading.Event()
load_ok_event = threading.Event()
load_thread = threading.Thread(target=load,
args=(tenant_pg, load_stop_event, load_ok_event))
load_thread.start()
# run checkpoint manually to be sure that data landed in remote storage
with closing(env.pageserver.connect()) as psconn:
with psconn.cursor() as pscur:
pscur.execute(f"do_gc {tenant} {timeline}")
# ensure upload is completed
pageserver_http_client = env.pageserver.http_client()
timeline_detail = pageserver_http_client.timeline_detail(UUID(tenant), UUID(timeline))
assert timeline_detail['disk_consistent_lsn'] == timeline_detail['timeline_state']['Ready']
log.info("inititalizing new pageserver")
# bootstrap second pageserver
new_pageserver_dir = env.repo_dir / 'new_pageserver'
new_pageserver_dir.mkdir()
new_pageserver_pg_port = port_distributor.get_port()
new_pageserver_http_port = port_distributor.get_port()
log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port)
pageserver_bin = pathlib.Path(zenith_binpath) / 'pageserver'
new_pageserver_http_client = ZenithPageserverHttpClient(port=new_pageserver_http_port,
auth_token=None)
with new_pageserver_helper(new_pageserver_dir,
pageserver_bin,
remote_storage_mock_path,
new_pageserver_pg_port,
new_pageserver_http_port):
# call to attach timeline to new pageserver
new_pageserver_http_client.timeline_attach(UUID(tenant), UUID(timeline))
# FIXME cannot handle duplicate download requests, subject to fix in https://github.com/zenithdb/zenith/issues/997
time.sleep(5)
# the new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there were no new writes since the checkpoint
new_timeline_detail = wait_for(
number_of_iterations=5,
interval=1,
func=lambda: assert_local(new_pageserver_http_client, tenant, timeline))
assert new_timeline_detail['timeline_state'].get('Ready'), new_timeline_detail
# when load is active these checks can break because lsns are not static
# so lets check with some margin
if with_load == 'without_load':
# TODO revisit this once https://github.com/zenithdb/zenith/issues/1049 is fixed
assert_abs_margin_ratio(new_timeline_detail['disk_consistent_lsn'],
timeline_detail['disk_consistent_lsn'],
0.01)
assert_abs_margin_ratio(new_timeline_detail['timeline_state']['Ready'],
timeline_detail['timeline_state']['Ready'],
0.01)
# callmemaybe to start replication from safekeeper to the new pageserver
# when there is no load there is a clean checkpoint and no wal delta
# needs to be streamed to the new pageserver
# TODO (rodionov) use attach to start replication
with pg_cur(PgProtocol(host='localhost', port=new_pageserver_pg_port)) as cur:
# "callmemaybe {} {} host={} port={} options='-c ztimelineid={} ztenantid={}'"
safekeeper_connstring = f"host=localhost port={env.safekeepers[0].port.pg} options='-c ztimelineid={timeline} ztenantid={tenant} pageserver_connstr=postgresql://no_user:@localhost:{new_pageserver_pg_port}'"
cur.execute("callmemaybe {} {} {}".format(tenant, timeline, safekeeper_connstring))
tenant_pg.stop()
# rewrite zenith cli config to use new pageserver for basebackup to start new compute
cli_config_lines = (env.repo_dir / 'config').read_text().splitlines()
cli_config_lines[-2] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'"
cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'"
(env.repo_dir / 'config').write_text('\n'.join(cli_config_lines))
tenant_pg_config_file_path = pathlib.Path(tenant_pg.config_file_path())
tenant_pg_config_file_path.open('a').write(
f"\nzenith.page_server_connstring = 'postgresql://no_user:@localhost:{new_pageserver_pg_port}'"
)
tenant_pg.start()
# detach tenant from old pageserver before we check
# that all the data is there to be sure that old pageserver
# is no longer involved, and if it is, we will see the errors
pageserver_http_client.timeline_detach(UUID(tenant), UUID(timeline))
with pg_cur(tenant_pg) as cur:
# check that data is still there
cur.execute("SELECT sum(key) FROM t")
assert cur.fetchone() == (500500, )
# check that we can write new data
cur.execute("INSERT INTO t SELECT generate_series(1001,2000), 'some payload'")
cur.execute("SELECT sum(key) FROM t")
assert cur.fetchone() == (2001000, )
if with_load == 'with_load':
assert load_ok_event.wait(1)
log.info('stopping load thread')
load_stop_event.set()
load_thread.join()
log.info('load thread stopped')
# bring old pageserver back for clean shutdown via zenith cli
# new pageserver will be shut down by the context manager
cli_config_lines = (env.repo_dir / 'config').read_text().splitlines()
cli_config_lines[-2] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'"
cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'"
(env.repo_dir / 'config').write_text('\n'.join(cli_config_lines))


@@ -80,8 +80,8 @@ def test_twophase(zenith_simple_env: ZenithEnv):
cur2.execute("ROLLBACK PREPARED 'insert_two'") cur2.execute("ROLLBACK PREPARED 'insert_two'")
cur2.execute('SELECT * FROM foo') cur2.execute('SELECT * FROM foo')
assert cur2.fetchall() == [('one', ), ('three', )] assert cur2.fetchall() == [('one', ), ('three', )] # type: ignore[comparison-overlap]
# Only one committed insert is visible on the original branch # Only one committed insert is visible on the original branch
cur.execute('SELECT * FROM foo') cur.execute('SELECT * FROM foo')
assert cur.fetchall() == [('three', )] assert cur.fetchall() == [('three', )] # type: ignore[comparison-overlap]


@@ -12,7 +12,7 @@ from contextlib import closing
from dataclasses import dataclass, field from dataclasses import dataclass, field
from multiprocessing import Process, Value from multiprocessing import Process, Value
from pathlib import Path from pathlib import Path
from fixtures.zenith_fixtures import PgBin, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol from fixtures.zenith_fixtures import PgBin, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol
from fixtures.utils import lsn_to_hex, mkdir_if_needed from fixtures.utils import lsn_to_hex, mkdir_if_needed
from fixtures.log_helper import log from fixtures.log_helper import log
from typing import List, Optional, Any from typing import List, Optional, Any
@@ -603,3 +603,92 @@ def test_safekeeper_without_pageserver(test_output_dir: str,
env.postgres.safe_psql("insert into t select generate_series(1, 100)") env.postgres.safe_psql("insert into t select generate_series(1, 100)")
res = env.postgres.safe_psql("select sum(i) from t")[0][0] res = env.postgres.safe_psql("select sum(i) from t")[0][0]
assert res == 5050 assert res == 5050
def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder):
def safekeepers_guc(env: ZenithEnv, sk_names: List[str]) -> str:
return ','.join(
[f'localhost:{sk.port.pg}' for sk in env.safekeepers if sk.name in sk_names])
def execute_payload(pg: Postgres):
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
cur.execute('CREATE TABLE IF NOT EXISTS t(key int, value text)')
cur.execute("INSERT INTO t VALUES (0, 'something')")
cur.execute('SELECT SUM(key) FROM t')
sum_before = cur.fetchone()[0]
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute('SELECT SUM(key) FROM t')
sum_after = cur.fetchone()[0]
assert sum_after == sum_before + 5000050000
def show_statuses(safekeepers: List[Safekeeper], tenant_id: str, timeline_id: str):
for sk in safekeepers:
http_cli = sk.http_client()
try:
status = http_cli.timeline_status(tenant_id, timeline_id)
log.info(f"Safekeeper {sk.name} status: {status}")
except Exception as e:
log.info(f"Safekeeper {sk.name} status error: {e}")
zenith_env_builder.num_safekeepers = 4
env = zenith_env_builder.init()
env.zenith_cli(["branch", "test_replace_safekeeper", "main"])
log.info("Use only first 3 safekeepers")
env.safekeepers[3].stop()
active_safekeepers = ['sk1', 'sk2', 'sk3']
pg = env.postgres.create('test_replace_safekeeper')
pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers))
pg.start()
# learn zenith timeline from compute
tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0]
timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0]
execute_payload(pg)
show_statuses(env.safekeepers, tenant_id, timeline_id)
log.info("Restart all safekeepers to flush everything")
env.safekeepers[0].stop(immediate=True)
execute_payload(pg)
env.safekeepers[0].start()
env.safekeepers[1].stop(immediate=True)
execute_payload(pg)
env.safekeepers[1].start()
env.safekeepers[2].stop(immediate=True)
execute_payload(pg)
env.safekeepers[2].start()
env.safekeepers[0].stop(immediate=True)
env.safekeepers[1].stop(immediate=True)
env.safekeepers[2].stop(immediate=True)
env.safekeepers[0].start()
env.safekeepers[1].start()
env.safekeepers[2].start()
execute_payload(pg)
show_statuses(env.safekeepers, tenant_id, timeline_id)
log.info("Stop sk1 (simulate failure) and use only quorum of sk2 and sk3")
env.safekeepers[0].stop(immediate=True)
execute_payload(pg)
show_statuses(env.safekeepers, tenant_id, timeline_id)
log.info("Recreate postgres to replace failed sk1 with new sk4")
pg.stop_and_destroy().create('test_replace_safekeeper')
active_safekeepers = ['sk2', 'sk3', 'sk4']
env.safekeepers[3].start()
pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers))
pg.start()
execute_payload(pg)
show_statuses(env.safekeepers, tenant_id, timeline_id)
log.info("Stop sk2 to require quorum of sk3 and sk4 for normal work")
env.safekeepers[1].stop(immediate=True)
execute_payload(pg)
show_statuses(env.safekeepers, tenant_id, timeline_id)
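
For clarity, `safekeepers_guc` above just renders the comma-separated `host:port` list that `adjust_for_wal_acceptors` writes into the compute config. With three local safekeepers it produces something like the following (ports are illustrative; the real ones come from the port distributor):

```python
# Illustrative only: what safekeepers_guc returns for three active safekeepers.
sk_ports = {'sk1': 5454, 'sk2': 5455, 'sk3': 5456}
active_safekeepers = ['sk1', 'sk2', 'sk3']
guc = ','.join(f'localhost:{sk_ports[name]}' for name in active_safekeepers)
assert guc == 'localhost:5454,localhost:5455,localhost:5456'
```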


@@ -3,11 +3,11 @@ import logging.config
""" """
This file configures logging to use in python tests. This file configures logging to use in python tests.
Logs are automatically captured and shown in their Logs are automatically captured and shown in their
own section after all tests are executed. own section after all tests are executed.
To see logs for all (even successful) tests, run To see logs for all (even successful) tests, run
pytest with the following command: pytest with the following command:
- `pipenv run pytest -n8 -rA` - `poetry run pytest -n8 -rA`
Other log config can be set in pytest.ini file. Other log config can be set in pytest.ini file.
You can add `log_cli = true` to it to watch You can add `log_cli = true` to it to watch
@@ -34,7 +34,7 @@ LOGGING = {
def getLogger(name='root') -> logging.Logger: def getLogger(name='root') -> logging.Logger:
"""Method to get logger for tests. """Method to get logger for tests.
Should be used to get correctly initialized logger. """ Should be used to get correctly initialized logger. """
return logging.getLogger(name) return logging.getLogger(name)


@@ -4,6 +4,7 @@ from dataclasses import dataclass, field
from cached_property import cached_property from cached_property import cached_property
import asyncpg import asyncpg
import os import os
import boto3
import pathlib import pathlib
import uuid import uuid
import warnings import warnings
@@ -25,7 +26,7 @@ from dataclasses import dataclass
# Type-related stuff # Type-related stuff
from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import connection as PgConnection
from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union
from typing_extensions import Literal from typing_extensions import Literal
import pytest import pytest
@@ -330,6 +331,48 @@ class AuthKeys:
return token return token
class MockS3Server:
"""
Starts a mock S3 server for testing on the given port; errors if the server fails to start or exits prematurely.
Relies on `poetry` and the `moto` server being installed, since that is how the tests are run.
Also provides a set of methods to derive the connection properties from, and a method to kill the underlying server.
"""
def __init__(
self,
port: int,
):
self.port = port
self.subprocess = subprocess.Popen([f'poetry run moto_server s3 -p{port}'], shell=True)
error = None
try:
return_code = self.subprocess.poll()
if return_code is not None:
error = f"expected mock s3 server to run but it exited with code {return_code}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'"
except Exception as e:
error = f"expected mock s3 server to start but it failed with exception: {e}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'"
if error is not None:
log.error(error)
self.subprocess.kill()
raise RuntimeError("failed to start s3 mock server")
def endpoint(self) -> str:
return f"http://127.0.0.1:{self.port}"
def region(self) -> str:
return 'us-east-1'
def access_key(self) -> str:
return 'test'
def secret_key(self) -> str:
return 'test'
def kill(self):
self.subprocess.kill()
class ZenithEnvBuilder: class ZenithEnvBuilder:
""" """
Builder object to create a Zenith runtime environment Builder object to create a Zenith runtime environment
@@ -342,20 +385,69 @@ class ZenithEnvBuilder:
def __init__(self, def __init__(self,
repo_dir: Path, repo_dir: Path,
port_distributor: PortDistributor, port_distributor: PortDistributor,
pageserver_remote_storage: Optional[RemoteStorage] = None,
num_safekeepers: int = 0, num_safekeepers: int = 0,
pageserver_auth_enabled: bool = False): pageserver_auth_enabled: bool = False,
rust_log_override: Optional[str] = None):
self.repo_dir = repo_dir self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
self.port_distributor = port_distributor self.port_distributor = port_distributor
self.pageserver_remote_storage = pageserver_remote_storage
self.num_safekeepers = num_safekeepers self.num_safekeepers = num_safekeepers
self.pageserver_auth_enabled = pageserver_auth_enabled self.pageserver_auth_enabled = pageserver_auth_enabled
self.env: Optional[ZenithEnv] = None self.env: Optional[ZenithEnv] = None
self.s3_mock_server: Optional[MockS3Server] = None
if os.getenv('FORCE_MOCK_S3') is not None:
bucket_name = f'{repo_dir.name}_bucket'
log.warning(f'Unconditionally initializing mock S3 server for bucket {bucket_name}')
self.enable_s3_mock_remote_storage(bucket_name)
def init(self) -> ZenithEnv: def init(self) -> ZenithEnv:
# Cannot create more than one environment from one builder # Cannot create more than one environment from one builder
assert self.env is None, "environment already initialized" assert self.env is None, "environment already initialized"
self.env = ZenithEnv(self) self.env = ZenithEnv(self)
return self.env return self.env
"""
Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path.
Errors if the pageserver already has some remote storage configured, unless `force_enable` is set to `True`.
"""
def enable_local_fs_remote_storage(self, force_enable=True):
assert force_enable or self.pageserver_remote_storage is None, "remote storage is enabled already"
self.pageserver_remote_storage = LocalFsStorage(
Path(self.repo_dir / 'local_fs_remote_storage'))
"""
Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already.
Starts up the mock server, if that does not run yet.
Errors if the pageserver already has some remote storage configured, unless `force_enable` is set to `True`.
"""
def enable_s3_mock_remote_storage(self, bucket_name: str, force_enable=True):
assert force_enable or self.pageserver_remote_storage is None, "remote storage is enabled already"
if not self.s3_mock_server:
self.s3_mock_server = MockS3Server(self.port_distributor.get_port())
mock_endpoint = self.s3_mock_server.endpoint()
mock_region = self.s3_mock_server.region()
mock_access_key = self.s3_mock_server.access_key()
mock_secret_key = self.s3_mock_server.secret_key()
boto3.client(
's3',
endpoint_url=mock_endpoint,
region_name=mock_region,
aws_access_key_id=mock_access_key,
aws_secret_access_key=mock_secret_key,
).create_bucket(Bucket=bucket_name)
self.pageserver_remote_storage = S3Storage(bucket=bucket_name,
endpoint=mock_endpoint,
region=mock_region,
access_key=mock_access_key,
secret_key=mock_secret_key)
def __enter__(self): def __enter__(self):
return self return self
@@ -368,6 +460,8 @@ class ZenithEnvBuilder:
for sk in self.env.safekeepers: for sk in self.env.safekeepers:
sk.stop(immediate=True) sk.stop(immediate=True)
self.env.pageserver.stop(immediate=True) self.env.pageserver.stop(immediate=True)
if self.s3_mock_server:
self.s3_mock_server.kill()
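
For reference, a test opts into one of these remote-storage backends on the builder before calling `init()`, the same way the remote-storage and relocation tests earlier in this diff do. A condensed sketch (the bucket name is made up for the example):

```python
def test_with_remote_storage(zenith_env_builder: ZenithEnvBuilder):
    zenith_env_builder.num_safekeepers = 1
    # Pick one backend before init(): local fs ...
    # zenith_env_builder.enable_local_fs_remote_storage()
    # ... or the moto-backed S3 mock.
    zenith_env_builder.enable_s3_mock_remote_storage('example_bucket')
    env = zenith_env_builder.init()
    pg = env.postgres.create_start()
```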
class ZenithEnv: class ZenithEnv:
@@ -404,7 +498,9 @@ class ZenithEnv:
""" """
def __init__(self, config: ZenithEnvBuilder): def __init__(self, config: ZenithEnvBuilder):
self.repo_dir = config.repo_dir self.repo_dir = config.repo_dir
self.rust_log_override = config.rust_log_override
self.port_distributor = config.port_distributor self.port_distributor = config.port_distributor
self.s3_mock_server = config.s3_mock_server
self.postgres = PostgresFactory(self) self.postgres = PostgresFactory(self)
@@ -434,7 +530,9 @@ auth_type = '{pageserver_auth_type}'
""" """
# Create a corresponding ZenithPageserver object # Create a corresponding ZenithPageserver object
self.pageserver = ZenithPageserver(self, port=pageserver_port) self.pageserver = ZenithPageserver(self,
port=pageserver_port,
remote_storage=config.pageserver_remote_storage)
# Create config and a Safekeeper object for each safekeeper # Create config and a Safekeeper object for each safekeeper
for i in range(1, config.num_safekeepers + 1): for i in range(1, config.num_safekeepers + 1):
@@ -465,6 +563,8 @@ sync = false # Disable fsyncs to make the tests go faster
tmp.flush() tmp.flush()
cmd = ['init', f'--config={tmp.name}'] cmd = ['init', f'--config={tmp.name}']
append_pageserver_param_overrides(cmd, config.pageserver_remote_storage)
self.zenith_cli(cmd) self.zenith_cli(cmd)
# Start up the page server and all the safekeepers # Start up the page server and all the safekeepers
@@ -509,6 +609,9 @@ sync = false # Disable fsyncs to make the tests go faster
env_vars['ZENITH_REPO_DIR'] = str(self.repo_dir) env_vars['ZENITH_REPO_DIR'] = str(self.repo_dir)
env_vars['POSTGRES_DISTRIB_DIR'] = str(pg_distrib_dir) env_vars['POSTGRES_DISTRIB_DIR'] = str(pg_distrib_dir)
if self.rust_log_override is not None:
env_vars['RUST_LOG'] = self.rust_log_override
# Pass coverage settings # Pass coverage settings
var = 'LLVM_PROFILE_FILE' var = 'LLVM_PROFILE_FILE'
val = os.environ.get(var) val = os.environ.get(var)
@@ -583,6 +686,8 @@ def zenith_simple_env(_shared_simple_env: ZenithEnv) -> Iterator[ZenithEnv]:
yield _shared_simple_env yield _shared_simple_env
_shared_simple_env.postgres.stop_all() _shared_simple_env.postgres.stop_all()
if _shared_simple_env.s3_mock_server:
_shared_simple_env.s3_mock_server.kill()
@pytest.fixture(scope='function') @pytest.fixture(scope='function')
@@ -620,6 +725,16 @@ class ZenithPageserverHttpClient(requests.Session):
def check_status(self): def check_status(self):
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
def timeline_attach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID):
res = self.post(
f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/attach", )
res.raise_for_status()
def timeline_detach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID):
res = self.post(
f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/detach", )
res.raise_for_status()
def branch_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]: def branch_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]:
res = self.get(f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}") res = self.get(f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}")
res.raise_for_status() res.raise_for_status()
@@ -665,6 +780,21 @@ class ZenithPageserverHttpClient(requests.Session):
res.raise_for_status() res.raise_for_status()
return res.json() return res.json()
def timeline_list(self, tenant_id: uuid.UUID) -> List[str]:
res = self.get(f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}")
res.raise_for_status()
res_json = res.json()
assert isinstance(res_json, list)
return res_json
def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID):
res = self.get(
f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}")
res.raise_for_status()
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def get_metrics(self) -> str: def get_metrics(self) -> str:
res = self.get(f"http://localhost:{self.port}/metrics") res = self.get(f"http://localhost:{self.port}/metrics")
res.raise_for_status() res.raise_for_status()
@@ -677,17 +807,39 @@ class PageserverPort:
http: int http: int
@dataclass
class LocalFsStorage:
root: Path
@dataclass
class S3Storage:
bucket: str
region: str
access_key: Optional[str]
secret_key: Optional[str]
endpoint: Optional[str]
RemoteStorage = Union[LocalFsStorage, S3Storage]
class ZenithPageserver(PgProtocol): class ZenithPageserver(PgProtocol):
""" """
An object representing a running pageserver. An object representing a running pageserver.
Initializes the repository via `zenith init`. Initializes the repository via `zenith init`.
""" """
def __init__(self, env: ZenithEnv, port: PageserverPort, enable_auth=False): def __init__(self,
env: ZenithEnv,
port: PageserverPort,
remote_storage: Optional[RemoteStorage] = None,
enable_auth=False):
super().__init__(host='localhost', port=port.pg) super().__init__(host='localhost', port=port.pg)
self.env = env self.env = env
self.running = False self.running = False
self.service_port = port # do not shadow PgProtocol.port which is just int self.service_port = port # do not shadow PgProtocol.port which is just int
self.remote_storage = remote_storage
def start(self) -> 'ZenithPageserver': def start(self) -> 'ZenithPageserver':
""" """
@@ -696,7 +848,10 @@ class ZenithPageserver(PgProtocol):
""" """
assert self.running == False assert self.running == False
self.env.zenith_cli(['pageserver', 'start']) start_args = ['pageserver', 'start']
append_pageserver_param_overrides(start_args, self.remote_storage)
self.env.zenith_cli(start_args)
self.running = True self.running = True
return self return self
@@ -729,6 +884,34 @@ class ZenithPageserver(PgProtocol):
) )
def append_pageserver_param_overrides(params_to_update: List[str],
pageserver_remote_storage: Optional[RemoteStorage]):
if pageserver_remote_storage is not None:
if isinstance(pageserver_remote_storage, LocalFsStorage):
pageserver_storage_override = f"local_path='{pageserver_remote_storage.root}'"
elif isinstance(pageserver_remote_storage, S3Storage):
pageserver_storage_override = f"bucket_name='{pageserver_remote_storage.bucket}',\
bucket_region='{pageserver_remote_storage.region}'"
if pageserver_remote_storage.access_key is not None:
pageserver_storage_override += f",access_key_id='{pageserver_remote_storage.access_key}'"
if pageserver_remote_storage.secret_key is not None:
pageserver_storage_override += f",secret_access_key='{pageserver_remote_storage.secret_key}'"
if pageserver_remote_storage.endpoint is not None:
pageserver_storage_override += f",endpoint='{pageserver_remote_storage.endpoint}'"
else:
raise Exception(f'Unknown storage configuration {pageserver_remote_storage}')
params_to_update.append(
f'--pageserver-config-override=remote_storage={{{pageserver_storage_override}}}')
env_overrides = os.getenv('ZENITH_PAGESERVER_OVERRIDES')
if env_overrides is not None:
params_to_update += [
f'--pageserver-config-override={o.strip()}' for o in env_overrides.split(';')
]
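
As a quick illustration of the override string this helper emits for a local-fs configuration (the repo path is made up; any `ZENITH_PAGESERVER_OVERRIDES` entries would be appended after it):

```python
# Illustrative only.
from pathlib import Path
from typing import List

params: List[str] = ['pageserver', 'start']
append_pageserver_param_overrides(
    params, LocalFsStorage(root=Path('/tmp/zenith_repo/local_fs_remote_storage')))
# params is now:
# ['pageserver', 'start',
#  "--pageserver-config-override=remote_storage={local_path='/tmp/zenith_repo/local_fs_remote_storage'}"]
```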
class PgBin: class PgBin:
""" A helper class for executing postgres binaries """ """ A helper class for executing postgres binaries """
def __init__(self, log_dir: str): def __init__(self, log_dir: str):
@@ -831,8 +1014,6 @@ class Postgres(PgProtocol):
path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.node_name path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.node_name
self.pgdata_dir = os.path.join(self.env.repo_dir, path) self.pgdata_dir = os.path.join(self.env.repo_dir, path)
if self.env.safekeepers:
self.adjust_for_wal_acceptors(self.env.get_safekeeper_connstrs())
if config_lines is None: if config_lines is None:
config_lines = [] config_lines = []
self.config(config_lines) self.config(config_lines)
@@ -889,7 +1070,9 @@ class Postgres(PgProtocol):
# walproposer uses different application_name # walproposer uses different application_name
if ("synchronous_standby_names" in cfg_line or if ("synchronous_standby_names" in cfg_line or
# don't ask pageserver to fetch WAL from compute # don't ask pageserver to fetch WAL from compute
"callmemaybe_connstring" in cfg_line): "callmemaybe_connstring" in cfg_line or
# don't repeat wal_acceptors multiple times
"wal_acceptors" in cfg_line):
continue continue
f.write(cfg_line) f.write(cfg_line)
f.write("synchronous_standby_names = 'walproposer'\n") f.write("synchronous_standby_names = 'walproposer'\n")


@@ -2,7 +2,7 @@
name = "walkeeper" name = "walkeeper"
version = "0.1.0" version = "0.1.0"
authors = ["Stas Kelvich <stas@zenith.tech>"] authors = ["Stas Kelvich <stas@zenith.tech>"]
edition = "2018" edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -19,9 +19,9 @@ tracing = "0.1.27"
clap = "2.33.0" clap = "2.33.0"
daemonize = "0.4.1" daemonize = "0.4.1"
rust-s3 = { version = "0.28", default-features = false, features = ["no-verify-ssl", "tokio-rustls-tls"] } rust-s3 = { version = "0.28", default-features = false, features = ["no-verify-ssl", "tokio-rustls-tls"] }
tokio = "1.11" tokio = { version = "1.11", features = ["macros"] }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
anyhow = "1.0" anyhow = "1.0"
crc32c = "0.6.0" crc32c = "0.6.0"
humantime = "2.1.0" humantime = "2.1.0"
@@ -30,7 +30,7 @@ signal-hook = "0.3.10"
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
hex = "0.4.3" hex = "0.4.3"
const_format = "0.2.21" const_format = "0.2.21"
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
postgres_ffi = { path = "../postgres_ffi" } postgres_ffi = { path = "../postgres_ffi" }
workspace_hack = { path = "../workspace_hack" } workspace_hack = { path = "../workspace_hack" }


@@ -99,10 +99,7 @@ A: If the compute node has evicted a page, changes to it have been WAL-logged
Q: How long may Page Server wait for? Q: How long may Page Server wait for?
A: Not too long, hopefully. If a page is evicted, it probably was not used for A: Not too long, hopefully. If a page is evicted, it probably was not used for
a while, so the WAL service have had enough time to push changes to the Page a while, so the WAL service have had enough time to push changes to the Page
Server. There may be issues if there is no backpressure and compute node with Server. To limit the lag, tune backpressure using `max_replication_*_lag` settings.
WAL service run ahead of Page Server, though.
There is no backpressure right now, so you may even see some spurious
timeouts in tests.
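
For example, in the python tests a compute node could be started with explicit backpressure limits; the sketch below assumes the fixtures' `config_lines` parameter, and the GUC names merely follow the `max_replication_*_lag` pattern mentioned above (check the compute's settings for the exact names and defaults):

```python
# Hedged sketch: GUC names and values are illustrative, not authoritative.
pg = env.postgres.create_start(
    'main',
    config_lines=[
        'max_replication_write_lag = 500MB',
        'max_replication_flush_lag = 10GB',
    ])
```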
Q: How do WAL safekeepers communicate with each other? Q: How do WAL safekeepers communicate with each other?
A: They may only send each other messages via the compute node, they never A: They may only send each other messages via the compute node, they never


@@ -10,6 +10,7 @@ use std::fs::File;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::thread; use std::thread;
use tracing::*; use tracing::*;
use walkeeper::timeline::{CreateControlFile, FileStorage};
use zenith_utils::http::endpoint; use zenith_utils::http::endpoint;
use zenith_utils::{logging, tcp_listener, GIT_VERSION}; use zenith_utils::{logging, tcp_listener, GIT_VERSION};
@@ -86,8 +87,21 @@ fn main() -> Result<()> {
.takes_value(false) .takes_value(false)
.help("Do not wait for changes to be written safely to disk"), .help("Do not wait for changes to be written safely to disk"),
) )
.arg(
Arg::with_name("dump-control-file")
.long("dump-control-file")
.takes_value(true)
.help("Dump control file at path specifed by this argument and exit"),
)
.get_matches(); .get_matches();
if let Some(addr) = arg_matches.value_of("dump-control-file") {
let state = FileStorage::load_control_file(Path::new(addr), CreateControlFile::False)?;
let json = serde_json::to_string(&state)?;
print!("{}", json);
return Ok(());
}
let mut conf: SafeKeeperConf = Default::default(); let mut conf: SafeKeeperConf = Default::default();
if let Some(dir) = arg_matches.value_of("datadir") { if let Some(dir) = arg_matches.value_of("datadir") {
@@ -129,7 +143,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
// Prevent running multiple safekeepers on the same directory // Prevent running multiple safekeepers on the same directory
let lock_file_path = conf.workdir.join(LOCK_FILE_NAME); let lock_file_path = conf.workdir.join(LOCK_FILE_NAME);
let lock_file = File::create(&lock_file_path).with_context(|| "failed to open lockfile")?; let lock_file = File::create(&lock_file_path).context("failed to open lockfile")?;
lock_file.try_lock_exclusive().with_context(|| { lock_file.try_lock_exclusive().with_context(|| {
format!( format!(
"control file {} is locked by some other process", "control file {} is locked by some other process",
@@ -183,7 +197,12 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
.spawn(|| { .spawn(|| {
// TODO authentication // TODO authentication
let router = http::make_router(conf_); let router = http::make_router(conf_);
endpoint::serve_thread_main(router, http_listener).unwrap(); endpoint::serve_thread_main(
router,
http_listener,
std::future::pending(), // never shut down
)
.unwrap();
})?, })?,
); );
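
The new `--dump-control-file` flag prints the control file as JSON and exits, which makes safekeeper state inspectable from scripts and tests. A minimal, hedged sketch of consuming it from python, assuming you already know the safekeeper binary path and the control file location (both vary by setup):

```python
import json
import subprocess
from pathlib import Path


def dump_control_file(safekeeper_bin: Path, control_file_path: Path) -> dict:
    # Run the binary with the new flag and parse the JSON it prints to stdout.
    out = subprocess.check_output(
        [str(safekeeper_bin), '--dump-control-file', str(control_file_path)],
        text=True)
    return json.loads(out)
```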


@@ -6,8 +6,8 @@
//! from the call list. //! from the call list.
//! //!
use crate::SafeKeeperConf; use crate::SafeKeeperConf;
use anyhow::anyhow; use anyhow::{Context, Result};
use anyhow::Result; use std::collections::hash_map::Entry;
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::Mutex; use std::sync::Mutex;
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
@@ -42,9 +42,12 @@ async fn request_callback(
let me_conf: postgres::config::Config = me_connstr.parse().unwrap(); let me_conf: postgres::config::Config = me_connstr.parse().unwrap();
let (host, port) = connection_host_port(&me_conf); let (host, port) = connection_host_port(&me_conf);
// pageserver connstr is needed to be able to distinguish between different pageservers
// it is required to correctly manage callmemaybe subscriptions when more than one pageserver is involved
// TODO it is better to use some sort of a unique id instead of connection string, see https://github.com/zenithdb/zenith/issues/1105
let callme = format!( let callme = format!(
"callmemaybe {} {} host={} port={} options='-c ztimelineid={} ztenantid={}'", "callmemaybe {} {} host={} port={} options='-c ztimelineid={} ztenantid={} pageserver_connstr={}'",
tenantid, timelineid, host, port, timelineid, tenantid tenantid, timelineid, host, port, timelineid, tenantid, pageserver_connstr,
); );
let _ = client.simple_query(&callme).await?; let _ = client.simple_query(&callme).await?;
@@ -61,18 +64,36 @@ pub fn thread_main(conf: SafeKeeperConf, rx: UnboundedReceiver<CallmeEvent>) ->
runtime.block_on(main_loop(conf, rx)) runtime.block_on(main_loop(conf, rx))
} }
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
pub struct SubscriptionStateKey {
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
pageserver_connstr: String,
}
impl SubscriptionStateKey {
pub fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId, pageserver_connstr: String) -> Self {
Self {
tenant_id,
timeline_id,
pageserver_connstr,
}
}
}
/// Messages to the callmemaybe thread /// Messages to the callmemaybe thread
#[derive(Debug)] #[derive(Debug)]
pub enum CallmeEvent { pub enum CallmeEvent {
// add new subscription to the list // add new subscription to the list
Subscribe(ZTenantId, ZTimelineId, String), Subscribe(SubscriptionStateKey),
// remove the subscription from the list // remove the subscription from the list
Unsubscribe(ZTenantId, ZTimelineId), Unsubscribe(SubscriptionStateKey),
// don't serve this subscription, but keep it in the list // don't serve this subscription, but keep it in the list
Pause(ZTenantId, ZTimelineId), Pause(SubscriptionStateKey),
// resume this subscription, if it exists, // resume this subscription, if it exists,
// but don't create a new one if it is gone // but don't create a new one if it is gone
Resume(ZTenantId, ZTimelineId), Resume(SubscriptionStateKey),
// TODO how do we delete from subscriptions?
} }
#[derive(Debug)] #[derive(Debug)]
@@ -118,6 +139,7 @@ impl SubscriptionState {
let timelineid = self.timelineid; let timelineid = self.timelineid;
let tenantid = self.tenantid; let tenantid = self.tenantid;
let pageserver_connstr = self.pageserver_connstr.clone();
tokio::spawn(async move { tokio::spawn(async move {
if let Err(err) = handle.await { if let Err(err) = handle.await {
if err.is_cancelled() { if err.is_cancelled() {
@@ -125,8 +147,8 @@ impl SubscriptionState {
timelineid, tenantid); timelineid, tenantid);
} else { } else {
error!( error!(
"callback task for timelineid={} tenantid={} failed: {}", "callback task for timelineid={} tenantid={} pageserver_connstr={} failed: {}",
timelineid, tenantid, err timelineid, tenantid, pageserver_connstr, err
); );
} }
} }
@@ -138,7 +160,7 @@ impl SubscriptionState {
// Ignore call request if this subscription is paused // Ignore call request if this subscription is paused
if self.paused { if self.paused {
debug!( debug!(
"ignore call request for paused subscription "ignore call request for paused subscription \
tenantid: {}, timelineid: {}", tenantid: {}, timelineid: {}",
self.tenantid, self.timelineid self.tenantid, self.timelineid
); );
@@ -148,7 +170,7 @@ impl SubscriptionState {
// Check if it too early to recall // Check if it too early to recall
if self.handle.is_some() && self.last_call_time.elapsed() < recall_period { if self.handle.is_some() && self.last_call_time.elapsed() < recall_period {
debug!( debug!(
"too early to recall. self.last_call_time.elapsed: {:?}, recall_period: {:?} "too early to recall. self.last_call_time.elapsed: {:?}, recall_period: {:?} \
tenantid: {}, timelineid: {}", tenantid: {}, timelineid: {}",
self.last_call_time, recall_period, self.tenantid, self.timelineid self.last_call_time, recall_period, self.tenantid, self.timelineid
); );
@@ -176,8 +198,7 @@ impl SubscriptionState {
// Update last_call_time // Update last_call_time
self.last_call_time = Instant::now(); self.last_call_time = Instant::now();
info!( info!(
"new call spawned. time {:?} "new call spawned. last call time {:?} tenantid: {}, timelineid: {}",
tenantid: {}, timelineid: {}",
self.last_call_time, self.tenantid, self.timelineid self.last_call_time, self.tenantid, self.timelineid
); );
} }
@@ -190,7 +211,7 @@ impl Drop for SubscriptionState {
}

pub async fn main_loop(conf: SafeKeeperConf, mut rx: UnboundedReceiver<CallmeEvent>) -> Result<()> {
-   let subscriptions: Mutex<HashMap<(ZTenantId, ZTimelineId), SubscriptionState>> =
+   let subscriptions: Mutex<HashMap<SubscriptionStateKey, SubscriptionState>> =
        Mutex::new(HashMap::new());
    let mut ticker = tokio::time::interval(conf.recall_period);
@@ -198,54 +219,82 @@ pub async fn main_loop(conf: SafeKeeperConf, mut rx: UnboundedReceiver<CallmeEve
        tokio::select! {
            request = rx.recv() =>
            {
-               match request.ok_or_else(|| anyhow!("done"))?
+               match request.context("done")?
                {
-                   CallmeEvent::Subscribe(tenantid, timelineid, pageserver_connstr) =>
+                   CallmeEvent::Subscribe(key) =>
                    {
+                       let _enter = info_span!("callmemaybe: subscribe", timelineid = %key.timeline_id, tenantid = %key.tenant_id, pageserver_connstr=%key.pageserver_connstr.clone()).entered();
                        let mut subscriptions = subscriptions.lock().unwrap();
-                       if let Some(sub) = subscriptions.get(&(tenantid, timelineid))
-                       {
-                           info!("callmemaybe. subscription already exists {:?}", sub);
-                       }
-                       if let Some(mut sub) = subscriptions.insert((tenantid, timelineid),
-                           SubscriptionState::new(tenantid, timelineid, pageserver_connstr))
-                       {
-                           sub.call(conf.recall_period, conf.listen_pg_addr.clone());
-                       }
-                       info!("callmemaybe. thread_main. subscribe callback request for timelineid={} tenantid={}",
-                           timelineid, tenantid);
+                       // XXX this clone is ugly, is there a way to use the trick with Borrow trait with entry API?
+                       // when we switch to node id instead of the connection string key will be Copy and there will be no need to clone
+                       match subscriptions.entry(key.clone()) {
+                           Entry::Occupied(_) => {
+                               // Do nothing if subscription already exists
+                               // If it is paused it means that there is already established replication connection.
+                               // If it is not paused it will be polled with other subscriptions when timeout expires.
+                               // This can occur when replication channel is established before subscription is added.
+                               info!(
+                                   "subscription already exists",
+                               );
+                           }
+                           Entry::Vacant(entry) => {
+                               let subscription = entry.insert(SubscriptionState::new(
+                                   key.tenant_id,
+                                   key.timeline_id,
+                                   key.pageserver_connstr,
+                               ));
+                               subscription.call(conf.recall_period, conf.listen_pg_addr.clone());
+                           }
+                       }
                    },
-                   CallmeEvent::Unsubscribe(tenantid, timelineid) => {
+                   CallmeEvent::Unsubscribe(key) => {
+                       let _enter = debug_span!("callmemaybe: unsubscribe", timelineid = %key.timeline_id, tenantid = %key.tenant_id, pageserver_connstr=%key.pageserver_connstr.clone()).entered();
+                       debug!("unsubscribe");
                        let mut subscriptions = subscriptions.lock().unwrap();
-                       subscriptions.remove(&(tenantid, timelineid));
-                       info!("callmemaybe. thread_main. unsubscribe callback. request for timelineid={} tenantid={}",
-                           timelineid, tenantid);
+                       subscriptions.remove(&key);
                    },
-                   CallmeEvent::Pause(tenantid, timelineid) => {
+                   CallmeEvent::Pause(key) => {
+                       let _enter = debug_span!("callmemaybe: pause", timelineid = %key.timeline_id, tenantid = %key.tenant_id, pageserver_connstr=%key.pageserver_connstr.clone()).entered();
                        let mut subscriptions = subscriptions.lock().unwrap();
-                       if let Some(sub) = subscriptions.get_mut(&(tenantid, timelineid))
-                       {
-                           sub.pause();
-                       };
-                       info!("callmemaybe. thread_main. pause callback request for timelineid={} tenantid={}",
-                           timelineid, tenantid);
+                       // If pause received when no corresponding subscription exists it means that someone started replication
+                       // without using callmemaybe. So we create subscription and pause it.
+                       // In tenant relocation scenario subscribe call will be executed after pause when compute is restarted.
+                       // In that case there is no need to create new/unpause existing subscription.
+                       match subscriptions.entry(key.clone()) {
+                           Entry::Occupied(mut sub) => {
+                               debug!("pause existing");
+                               sub.get_mut().pause();
+                           }
+                           Entry::Vacant(entry) => {
+                               debug!("create paused");
+                               let subscription = entry.insert(SubscriptionState::new(
+                                   key.tenant_id,
+                                   key.timeline_id,
+                                   key.pageserver_connstr,
+                               ));
+                               subscription.pause();
+                           }
+                       }
                    },
-                   CallmeEvent::Resume(tenantid, timelineid) => {
+                   CallmeEvent::Resume(key) => {
+                       debug!(
+                           "callmemaybe. thread_main. resume callback request for timelineid={} tenantid={} pageserver_connstr={}",
+                           key.timeline_id, key.tenant_id, key.pageserver_connstr,
+                       );
                        let mut subscriptions = subscriptions.lock().unwrap();
-                       if let Some(sub) = subscriptions.get_mut(&(tenantid, timelineid))
+                       if let Some(sub) = subscriptions.get_mut(&key)
                        {
                            sub.resume();
                        };
-                       info!("callmemaybe. thread_main. resume callback request for timelineid={} tenantid={}",
-                           timelineid, tenantid);
                    },
                }
            },
            _ = ticker.tick() => {
+               let _enter = debug_span!("callmemaybe: tick").entered();
                let mut subscriptions = subscriptions.lock().unwrap();
-               for (&(_tenantid, _timelineid), state) in subscriptions.iter_mut() {
+               for (_, state) in subscriptions.iter_mut() {
                    state.call(conf.recall_period, conf.listen_pg_addr.clone());
                }
            },
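The XXX note above is about `HashMap::entry` taking its key by value: with a non-Copy key (the connection string makes `SubscriptionStateKey` non-Copy), the key has to be cloned if it is still needed afterwards. A generic sketch of the occupied/vacant pattern, independent of this codebase:

// Generic illustration of the entry() idiom used in the Subscribe and Pause arms:
// insert-if-absent and mutate-if-present in a single HashMap lookup.
use std::collections::hash_map::{Entry, HashMap};

fn upsert_counter(map: &mut HashMap<String, u64>, key: &str) {
    // entry() consumes the key, so a borrowed key must be turned into an owned one here.
    match map.entry(key.to_owned()) {
        Entry::Occupied(mut occupied) => {
            // Key already present: mutate the stored value in place.
            *occupied.get_mut() += 1;
        }
        Entry::Vacant(vacant) => {
            // Key absent: insert the initial value exactly once.
            vacant.insert(1);
        }
    }
}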

View File

@@ -6,7 +6,7 @@ use crate::receive_wal::ReceiveWalConn;
use crate::send_wal::ReplicationConn;
use crate::timeline::{Timeline, TimelineTools};
use crate::SafeKeeperConf;
-use anyhow::{anyhow, bail, Context, Result};
+use anyhow::{bail, Context, Result};
use postgres_ffi::xlog_utils::PG_TLI;
use regex::Regex;
@@ -16,7 +16,7 @@ use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
-use zenith_utils::zid::{ZTenantId, ZTimelineId};
+use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
use crate::callmemaybe::CallmeEvent;
use crate::timeline::CreateControlFile;
@@ -30,6 +30,7 @@ pub struct SafekeeperPostgresHandler {
    pub ztenantid: Option<ZTenantId>,
    pub ztimelineid: Option<ZTimelineId>,
    pub timeline: Option<Arc<Timeline>>,
+   pageserver_connstr: Option<String>,
    //sender to communicate with callmemaybe thread
    pub tx: UnboundedSender<CallmeEvent>,
}
@@ -56,16 +57,15 @@ fn parse_cmd(cmd: &str) -> Result<SafekeeperPostgresCommand> {
        let start_lsn = caps
            .next()
            .map(|cap| cap[1].parse::<Lsn>())
-           .ok_or_else(|| anyhow!("failed to parse start LSN from START_REPLICATION command"))??;
+           .context("failed to parse start LSN from START_REPLICATION command")??;
        Ok(SafekeeperPostgresCommand::StartReplication { start_lsn })
    } else if cmd.starts_with("IDENTIFY_SYSTEM") {
        Ok(SafekeeperPostgresCommand::IdentifySystem)
    } else if cmd.starts_with("JSON_CTRL") {
-       let cmd = cmd
-           .strip_prefix("JSON_CTRL")
-           .ok_or_else(|| anyhow!("invalid prefix"))?;
-       let parsed_cmd: AppendLogicalMessage = serde_json::from_str(cmd)?;
-       Ok(SafekeeperPostgresCommand::JSONCtrl { cmd: parsed_cmd })
+       let cmd = cmd.strip_prefix("JSON_CTRL").context("invalid prefix")?;
+       Ok(SafekeeperPostgresCommand::JSONCtrl {
+           cmd: serde_json::from_str(cmd)?,
+       })
    } else {
        bail!("unsupported command {}", cmd);
    }
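The `.ok_or_else(|| anyhow!(...))` to `.context(...)` changes in this file rely on anyhow's `Context` extension trait, which is implemented for `Option` as well as `Result`. A tiny standalone example of the idiom:

// Sketch of the anyhow idiom adopted above: Context turns a None into an error
// carrying the given message, without spelling out ok_or_else + anyhow!.
use anyhow::{Context, Result};

fn first_word(s: &str) -> Result<&str> {
    s.split_whitespace()
        .next()
        .context("input contained no words")
}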
@@ -89,6 +89,8 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
            self.appname = Some(app_name.clone());
        }
+       self.pageserver_connstr = params.get("pageserver_connstr").cloned();
        Ok(())
    } else {
        bail!("Walkeeper received unexpected initial message: {:?}", sm);
@@ -104,12 +106,8 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
            | SafekeeperPostgresCommand::StartReplication { .. }
            | SafekeeperPostgresCommand::IdentifySystem
            | SafekeeperPostgresCommand::JSONCtrl { .. } => {
-               let tenantid = self
-                   .ztenantid
-                   .ok_or_else(|| anyhow!("tenantid is required"))?;
-               let timelineid = self
-                   .ztimelineid
-                   .ok_or_else(|| anyhow!("timelineid is required"))?;
+               let tenantid = self.ztenantid.context("tenantid is required")?;
+               let timelineid = self.ztimelineid.context("timelineid is required")?;
                if self.timeline.is_none() {
                    // START_WAL_PUSH is the only command that initializes the timeline in production.
                    // There is also JSON_CTRL command, which should initialize the timeline for testing.
@@ -118,8 +116,11 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
                        | SafekeeperPostgresCommand::JSONCtrl { .. } => CreateControlFile::True,
                        _ => CreateControlFile::False,
                    };
-                   self.timeline
-                       .set(&self.conf, tenantid, timelineid, create_control_file)?;
+                   self.timeline.set(
+                       &self.conf,
+                       ZTenantTimelineId::new(tenantid, timelineid),
+                       create_control_file,
+                   )?;
                }
            }
        }
@@ -128,12 +129,12 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
            SafekeeperPostgresCommand::StartWalPush { pageserver_connstr } => {
                ReceiveWalConn::new(pgb, pageserver_connstr)
                    .run(self)
-                   .with_context(|| "failed to run ReceiveWalConn")?;
+                   .context("failed to run ReceiveWalConn")?;
            }
            SafekeeperPostgresCommand::StartReplication { start_lsn } => {
                ReplicationConn::new(pgb)
-                   .run(self, pgb, start_lsn)
-                   .with_context(|| "failed to run ReplicationConn")?;
+                   .run(self, pgb, start_lsn, self.pageserver_connstr.clone())
+                   .context("failed to run ReplicationConn")?;
            }
            SafekeeperPostgresCommand::IdentifySystem => {
                self.handle_identify_system(pgb)?;
@@ -154,6 +155,7 @@ impl SafekeeperPostgresHandler {
            ztenantid: None,
            ztimelineid: None,
            timeline: None,
+           pageserver_connstr: None,
            tx,
        }
    }

View File

@@ -6,6 +6,7 @@ use serde::Serializer;
use std::fmt::Display;
use std::sync::Arc;
use zenith_utils::lsn::Lsn;
+use zenith_utils::zid::ZTenantTimelineId;
use crate::safekeeper::Term;
use crate::safekeeper::TermHistory;
@@ -65,16 +66,13 @@ struct TimelineStatus {
/// Report info about timeline.
async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
-   let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
-   let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
+   let zttid = ZTenantTimelineId::new(
+       parse_request_param(&request, "tenant_id")?,
+       parse_request_param(&request, "timeline_id")?,
+   );
-   let tli = GlobalTimelines::get(
-       get_conf(&request),
-       tenant_id,
-       timeline_id,
-       CreateControlFile::False,
-   )
-   .map_err(ApiError::from_err)?;
+   let tli = GlobalTimelines::get(get_conf(&request), zttid, CreateControlFile::False)
+       .map_err(ApiError::from_err)?;
    let sk_state = tli.get_info();
    let flush_lsn = tli.get_end_of_wal();
@@ -85,8 +83,8 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
    };
    let status = TimelineStatus {
-       tenant_id,
-       timeline_id,
+       tenant_id: zttid.tenant_id,
+       timeline_id: zttid.timeline_id,
        acceptor_state: acc_state,
        commit_lsn: sk_state.commit_lsn,
        truncate_lsn: sk_state.truncate_lsn,

View File

@@ -2,7 +2,7 @@
use std::path::PathBuf;
use std::time::Duration;
-use zenith_utils::zid::ZTimelineId;
+use zenith_utils::zid::ZTenantTimelineId;
pub mod callmemaybe;
pub mod handler;
@@ -47,8 +47,10 @@ pub struct SafeKeeperConf {
}

impl SafeKeeperConf {
-   pub fn timeline_dir(&self, timelineid: &ZTimelineId) -> PathBuf {
-       self.workdir.join(timelineid.to_string())
+   pub fn timeline_dir(&self, zttid: &ZTenantTimelineId) -> PathBuf {
+       self.workdir
+           .join(zttid.tenant_id.to_string())
+           .join(zttid.timeline_id.to_string())
    }
}
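With `timeline_dir` now keyed by `ZTenantTimelineId`, timeline directories are nested under a per-tenant directory instead of sitting directly in the workdir. A toy illustration of the resulting path shape; the directory and ID strings are made-up placeholders, not values from this patch:

// Toy path construction mirroring the new layout: <workdir>/<tenant_id>/<timeline_id>.
use std::path::{Path, PathBuf};

fn nested_timeline_dir(workdir: &Path, tenant_id: &str, timeline_id: &str) -> PathBuf {
    workdir.join(tenant_id).join(timeline_id)
}

fn main() {
    // Placeholder IDs for illustration only.
    let dir = nested_timeline_dir(Path::new("safekeeper_data"), "aaaa", "bbbb");
    assert_eq!(dir, PathBuf::from("safekeeper_data/aaaa/bbbb"));
}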
@@ -62,7 +64,7 @@ impl Default for SafeKeeperConf {
            daemonize: false,
            no_sync: false,
            listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
-           listen_http_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
+           listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
            ttl: None,
            recall_period: defaults::DEFAULT_RECALL_PERIOD,
        }

Some files were not shown because too many files have changed in this diff.