Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-18 10:52:55 +00:00)

Compare commits: compute_no ... parallel_w
497 Commits
.circleci/config.yml (Normal file, 267)
@@ -0,0 +1,267 @@
version: 2.1

orbs:
  python: circleci/python@1.4.0

executors:
  zenith-build-executor:
    resource_class: xlarge
    docker:
      - image: cimg/rust:1.51.0

jobs:

  # A job to build postgres
  build-postgres:
    executor: zenith-build-executor
    steps:
      # Checkout the git repo (circleci doesn't have a flag to enable submodules here)
      - checkout

      # Grab the postgres git revision to build a cache key.
      # Note this works even though the submodule hasn't been checkout out yet.
      - run:
          name: Get postgres cache key
          command: |
            git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres

      - restore_cache:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}

      # FIXME We could cache our own docker container, instead of installing packages every time.
      - run:
          name: apt install dependencies
          command: |
            if [ ! -e tmp_install/bin/postgres ]; then
              sudo apt update
              sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
            fi

      # Build postgres if the restore_cache didn't find a build.
      # `make` can't figure out whether the cache is valid, since
      # it only compares file timestamps.
      - run:
          name: build postgres
          command: |
            if [ ! -e tmp_install/bin/postgres ]; then
              # "depth 1" saves some time by not cloning the whole repo
              git submodule update --init --depth 1
              make postgres
            fi

      - save_cache:
          name: Save postgres cache
          key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
          paths:
            - tmp_install

  # A job to build zenith rust code
  build-zenith:
    executor: zenith-build-executor
    parameters:
      build_type:
        type: enum
        enum: ["debug", "release"]
    steps:
      - run:
          name: apt install dependencies
          command: |
            sudo apt update
            sudo apt install libssl-dev clang

      # Checkout the git repo (without submodules)
      - checkout

      # Grab the postgres git revision to build a cache key.
      # Note this works even though the submodule hasn't been checkout out yet.
      - run:
          name: Get postgres cache key
          command: |
            git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres

      - restore_cache:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}

      - restore_cache:
          name: Restore rust cache
          keys:
            # Require an exact match. While an out of date cache might speed up the build,
            # there's no way to clean out old packages, so the cache grows every time something
            # changes.
            - v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}

      # Build the rust code, including test binaries
      - run:
          name: Rust build << parameters.build_type >>
          command: |
            export CARGO_INCREMENTAL=0
            BUILD_TYPE="<< parameters.build_type >>"
            if [[ $BUILD_TYPE == "debug" ]]; then
              echo "Build in debug mode"
              cargo build --bins --tests
            elif [[ $BUILD_TYPE == "release" ]]; then
              echo "Build in release mode"
              cargo build --release --bins --tests
            fi

      - save_cache:
          name: Save rust cache
          key: v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
          paths:
            - ~/.cargo/registry
            - ~/.cargo/git
            - target

      # Run rust unit tests
      - run: cargo test

      # Install the rust binaries, for use by test jobs
      # `--locked` is required; otherwise, `cargo install` will ignore Cargo.lock.
      # FIXME: this is a really silly way to install; maybe we should just output
      # a tarball as an artifact? Or a .deb package?
      - run:
          name: cargo install
          command: |
            export CARGO_INCREMENTAL=0
            BUILD_TYPE="<< parameters.build_type >>"
            if [[ $BUILD_TYPE == "debug" ]]; then
              echo "Install debug mode"
              CARGO_FLAGS="--debug"
            elif [[ $BUILD_TYPE == "release" ]]; then
              echo "Install release mode"
              # The default is release mode; there is no --release flag.
              CARGO_FLAGS=""
            fi
            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path pageserver
            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path walkeeper
            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path zenith

      # Install the postgres binaries, for use by test jobs
      # FIXME: this is a silly way to do "install"; maybe just output a standard
      # postgres package, whatever the favored form is (tarball? .deb package?)
      # Note that pg_regress needs some build artifacts that probably aren't
      # in the usual package...?
      - run:
          name: postgres install
          command: |
            cp -a tmp_install /tmp/zenith/pg_install

      # Save the rust output binaries for other jobs in this workflow.
      - persist_to_workspace:
          root: /tmp/zenith
          paths:
            - "*"

  run-pytest:
    #description: "Run pytest"
    executor: python/default
    parameters:
      # pytest args to specify the tests to run.
      #
      # This can be a test file name, e.g. 'test_pgbench.py, or a subdirectory,
      # or '-k foobar' to run tests containing string 'foobar'. See pytest man page
      # section SPECIFYING TESTS / SELECTING TESTS for details.
      #
      # Select the type of Rust build. Must be "release" or "debug".
      build_type:
        type: string
        default: "debug"
      # This parameter is required, to prevent the mistake of running all tests in one job.
      test_selection:
        type: string
        default: ""
      # Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr
      extra_params:
        type: string
        default: ""
      needs_postgres_source:
        type: boolean
        default: false
    steps:
      - attach_workspace:
          at: /tmp/zenith
      - checkout
      - when:
          condition: << parameters.needs_postgres_source >>
          steps:
            - run: git submodule update --init --depth 1
      - run:
          name: Install pipenv & deps
          working_directory: test_runner
          command: |
            pip install pipenv
            pipenv install
      - run:
          name: Run pytest
          working_directory: test_runner
          environment:
            - ZENITH_BIN: /tmp/zenith/bin
            - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
            - TEST_OUTPUT: /tmp/test_output
          command: |
            TEST_SELECTION="<< parameters.test_selection >>"
            EXTRA_PARAMS="<< parameters.extra_params >>"
            if [ -z "$TEST_SELECTION" ]; then
              echo "test_selection must be set"
              exit 1
            fi
            # Run the tests.
            #
            # The junit.xml file allows CircleCI to display more fine-grained test information
            # in its "Tests" tab in the results page.
            # -s prevents pytest from capturing output, which helps to see
            # what's going on if the test hangs
            # --verbose prints name of each test (helpful when there are
            # multiple tests in one file)
            # -rA prints summary in the end
            pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short -s --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
      - run:
          # CircleCI artifacts are preserved one file at a time, so skipping
          # this step isn't a good idea. If you want to extract the
          # pageserver state, perhaps a tarball would be a better idea.
          name: Delete all data but logs
          when: always
          command: |
            du -sh /tmp/test_output/*
            find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" -delete
            du -sh /tmp/test_output/*
      - store_artifacts:
          path: /tmp/test_output
      # The store_test_results step tells CircleCI where to find the junit.xml file.
      - store_test_results:
          path: /tmp/test_output

workflows:
  build_and_test:
    jobs:
      - build-postgres
      - build-zenith:
          name: build-zenith-<< matrix.build_type >>
          matrix:
            parameters:
              build_type: ["debug", "release"]
          requires:
            - build-postgres
      - run-pytest:
          name: pg_regress tests << matrix.build_type >>
          matrix:
            parameters:
              build_type: ["debug", "release"]
          test_selection: batch_pg_regress
          needs_postgres_source: true
          requires:
            - build-zenith-<< matrix.build_type >>
      - run-pytest:
          name: other tests << matrix.build_type >>
          matrix:
            parameters:
              build_type: ["debug", "release"]
          test_selection: batch_others
          requires:
            - build-zenith-<< matrix.build_type >>
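The run-pytest job above reduces to a handful of shell commands. A minimal local sketch of the same flow, assuming the binaries were installed under /tmp/zenith as in the build-zenith job (the paths and the chosen test batch are illustrative, not part of the config):

```sh
# Environment the CI job sets for the test runner.
export ZENITH_BIN=/tmp/zenith/bin
export POSTGRES_DISTRIB_DIR=/tmp/zenith/pg_install
export TEST_OUTPUT=/tmp/test_output

cd test_runner
pip install pipenv && pipenv install
# Pick one batch, as the CI matrix does (batch_pg_regress or batch_others).
pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short -s --verbose -rA batch_others
```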
.dockerignore (Normal file, 13)
@@ -0,0 +1,13 @@
**/.git/
**/__pycache__
**/.pytest_cache

/target
/tmp_check
/tmp_install
/tmp_check_cli
/test_output
/.vscode
/.zenith
/integration_tests/.zenith
/Dockerfile
.github/workflows/notifications.yml (vendored, Normal file, 45)
@@ -0,0 +1,45 @@
name: Send Notifications

on:
  push:
    branches: [ main ]

jobs:
  send-notifications:
    timeout-minutes: 30
    name: send commit notifications
    runs-on: ubuntu-latest

    steps:

    - name: Checkout
      uses: actions/checkout@v2
      with:
        submodules: true
        fetch-depth: 2

    - name: Form variables for notification message
      id: git_info_grab
      run: |
        git_stat=$(git show --stat=50)
        git_stat="${git_stat//'%'/'%25'}"
        git_stat="${git_stat//$'\n'/'%0A'}"
        git_stat="${git_stat//$'\r'/'%0D'}"
        git_stat="${git_stat// / }" # space -> 'Space En', as github tends to eat ordinary spaces
        echo "::set-output name=git_stat::$git_stat"
        echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
        echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})"

    - name: Send notification
      uses: appleboy/telegram-action@master
      with:
        to: ${{ secrets.TELEGRAM_TO }}
        token: ${{ secrets.TELEGRAM_TOKEN }}
        format: markdown
        args: |
          *@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})

          ```
          ${{ steps.git_info_grab.outputs.git_stat }}
          ```
.github/workflows/testing.yml (vendored, 65)
@@ -1,49 +1,41 @@
name: regression check
name: Build and Test

on: [push]
on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  regression-check:
    strategy:
      matrix:
        # If we want to duplicate this job for different
        # Rust toolchains (e.g. nightly or 1.37.0), add them here.
        rust_toolchain: [stable]
        os: [ubuntu-latest]
    timeout-minutes: 30
    name: run regression test suite
    runs-on: ubuntu-latest
    runs-on: ${{ matrix.os }}

    steps:

    - name: Checkout
      uses: actions/checkout@v2
      with:
        submodules: true
        fetch-depth: 2

    - name: Form variables for notification message
      id: git_info_grab
      run: |
        git_stat=$(git show --stat=50)
        git_stat="${git_stat//'%'/'%25'}"
        git_stat="${git_stat//$'\n'/'%0A'}"
        git_stat="${git_stat//$'\r'/'%0D'}"
        git_stat="${git_stat// / }" # space -> 'Space En', as github tends to eat ordinary spaces
        echo "::set-output name=git_stat::$git_stat"
        echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
        echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})"

    - name: Send notification
      uses: appleboy/telegram-action@master
    - name: install rust toolchain ${{ matrix.rust_toolchain }}
      uses: actions-rs/toolchain@v1
      with:
        to: ${{ secrets.TELEGRAM_TO }}
        token: ${{ secrets.TELEGRAM_TOKEN }}
        format: markdown
        args: |
          *@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})

          ```
          ${{ steps.git_info_grab.outputs.git_stat }}
          ```
        profile: minimal
        toolchain: ${{ matrix.rust_toolchain }}
        override: true

    - name: Install postgres dependencies
      run: |
        sudo apt update
        sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libxml2-dev libcurl4-openssl-dev
        sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev

    - name: Set pg revision for caching
      id: pg_ver
@@ -60,11 +52,7 @@ jobs:
    - name: Build postgres
      if: steps.cache_pg.outputs.cache-hit != 'true'
      run: |
        ./pgbuild.sh

    - name: Install rust
      run: |
        sudo apt install -y cargo
        make postgres

    - name: Cache cargo deps
      id: cache_cargo
@@ -76,13 +64,10 @@
          target
        key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}

      # That build is only to build dependencies and can be skipped if Cargo.lock
      # wasn't changed. Next steps need their own build
    - name: Install cargo deps
      if: steps.cache_cargo.outputs.cache-hit != 'true'
    - name: Run cargo build
      run: |
        cargo build
        cargo build --workspace --bins --examples --tests

    - name: Run test
    - name: Run cargo test
      run: |
        cargo test --test test_pageserver -- --nocapture --test-threads=1
        cargo test -- --nocapture --test-threads=1
.gitignore (vendored, 6)
@@ -1,3 +1,9 @@
/target
/tmp_check
/tmp_install
/tmp_check_cli
__pycache__/
test_output/
.vscode
/.zenith
/integration_tests/.zenith
CONTRIBUTING.md (Normal file, 31)
@@ -0,0 +1,31 @@
# How to contribute

Howdy! Usual good software engineering practices apply. Write
tests. Write comments. Follow standard Rust coding practices where
possible. Use 'cargo fmt' and 'clippy' to tidy up formatting.

There are soft spots in the code, which could use cleanup,
refactoring, additional comments, and so forth. Let's try to raise the
bar, and clean things up as we go. Try to leave code in a better shape
than it was before.

## Submitting changes

1. Make a PR for every change.

Even seemingly trivial patches can break things in surprising ways.
Use of common sense is OK. If you're only fixing a typo in a comment,
it's probably fine to just push it. But if in doubt, open a PR.

2. Get at least one +1 on your PR before you push.

For simple patches, it will only take a minute for someone to review
it.

3. Always keep the CI green.

Do not push, if the CI failed on your PR. Even if you think it's not
your patch's fault. Help to fix the root cause if something else has
broken the CI, before pushing.

*Happy Hacking!*
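The guidelines above ask for `cargo fmt` and `clippy` before submitting. A minimal sketch of the corresponding commands (the exact flags are an assumption, not something the guidelines mandate):

```sh
# Reformat every crate in the workspace in place.
cargo fmt --all
# Lint all crates and targets; fix or justify whatever it reports.
cargo clippy --workspace --all-targets
```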
COPYRIGHT (Normal file, 20)
@@ -0,0 +1,20 @@
This software is licensed under the Apache 2.0 License:

----------------------------------------------------------------------------
Copyright 2021 Zenith Labs, Inc

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
----------------------------------------------------------------------------

The PostgreSQL submodule in vendor/postgres is licensed under the
PostgreSQL license. See vendor/postgres/COPYRIGHT.
Cargo.lock (generated, 1318)
File diff suppressed because it is too large
Cargo.toml (12)
@@ -1,6 +1,16 @@
[workspace]
members = [
    "integration_tests",
    "pageserver",
    "walkeeper",
    "zenith",
    "control_plane",
    "postgres_ffi",
    "zenith_utils",
    "workspace_hack",
    "proxy"
]

[profile.release]
# This is useful for profiling and, to some extent, debug.
# Besides, debug info should not affect the performance.
debug = true
Dockerfile (Normal file, 95)
@@ -0,0 +1,95 @@
#
# Docker image for console integration testing.
#
# We may also reuse it in CI to unify installation process and as a general binaries building
# tool for production servers.
#
# Dynamic linking is used for librocksdb and libstdc++ bacause librocksdb-sys calls
# bindgen with "dynamic" feature flag. This also prevents usage of dockerhub alpine-rust
# images which are statically linked and have guards against any dlopen. I would rather
# prefer all static binaries so we may change the way librocksdb-sys builds or wait until
# we will have our own storage and drop rockdb dependency.
#
# Cargo-chef is used to separate dependencies building from main binaries building. This
# way `docker build` will download and install dependencies only of there are changes to
# out Cargo.toml files.
#


#
# build postgres separately -- this layer will be rebuilt only if one of
# mentioned paths will get any changes
#
FROM alpine:3.13 as pg-build
RUN apk add --update clang llvm compiler-rt compiler-rt-static lld musl-dev binutils \
    make bison flex readline-dev zlib-dev perl linux-headers
WORKDIR zenith
COPY ./vendor/postgres vendor/postgres
COPY ./Makefile Makefile
# Build using clang and lld
RUN CC='clang' LD='lld' CFLAGS='-fuse-ld=lld --rtlib=compiler-rt' make postgres -j4

#
# Calculate cargo dependencies.
# This will always run, but only generate recipe.json with list of dependencies without
# installing them.
#
FROM alpine:20210212 as cargo-deps-inspect
RUN apk add --update rust cargo
RUN cargo install cargo-chef
WORKDIR zenith
COPY . .
RUN cargo chef prepare --recipe-path recipe.json

#
# Build cargo dependencies.
# This temp cantainner would be build only if recipe.json was changed.
#
FROM alpine:20210212 as deps-build
RUN apk add --update rust cargo openssl-dev clang build-base
# rust-rocksdb can be built against system-wide rocksdb -- that saves about
# 10 minutes during build. Rocksdb apk package is in testing now, but use it
# anyway. In case of any troubles we can download and build rocksdb here manually
# (to cache it as a docker layer).
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
WORKDIR zenith
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=cargo-deps-inspect /root/.cargo/bin/cargo-chef /root/.cargo/bin/
COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json
RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json

#
# Build zenith binaries
#
FROM alpine:20210212 as build
RUN apk add --update rust cargo openssl-dev clang build-base
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
WORKDIR zenith
COPY . .
# Copy cached dependencies
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=deps-build /zenith/target target
COPY --from=deps-build /root/.cargo /root/.cargo
RUN cargo build --release

#
# Copy binaries to resulting image.
# build-base hare to provide libstdc++ (it will also bring gcc, but leave it this way until we figure
# out how to statically link rocksdb or avoid it at all).
#
FROM alpine:3.13
RUN apk add --update openssl build-base
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install /usr/local
COPY docker-entrypoint.sh /docker-entrypoint.sh

RUN addgroup zenith && adduser -h /data -D -G zenith zenith
VOLUME ["/data"]
WORKDIR /data
USER zenith
EXPOSE 6400
ENTRYPOINT ["/docker-entrypoint.sh"]
CMD ["pageserver"]
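A usage sketch for this image, based only on what the Dockerfile declares (VOLUME /data, EXPOSE 6400, docker-entrypoint.sh as entrypoint, pageserver as the default command); the image tag and volume name are placeholders:

```sh
# Build from the repository root; vendor/postgres must be checked out.
docker build -t zenith .
# Run the default pageserver command with persistent data and the exposed port published.
docker run --rm -v zenith-data:/data -p 6400:6400 zenith
# Overriding the command (e.g. wal_acceptor or proxy) may also work,
# assuming docker-entrypoint.sh passes its arguments through (not shown here).
```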
LICENSE (Normal file, 202)
@@ -0,0 +1,202 @@

Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Makefile (Normal file, 68)
@@ -0,0 +1,68 @@
# Seccomp BPF is only available for Linux
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
SECCOMP = --with-libseccomp
else
SECCOMP =
endif

#
# Top level Makefile to build Zenith and PostgreSQL
#
all: zenith postgres

# We don't want to run 'cargo build' in parallel with the postgres build,
# because interleaving cargo build output with postgres build output looks
# confusing. Also, 'cargo build' is parallel on its own, so it would be too
# much parallelism. (Recursive invocation of postgres target still gets any
# '-j' flag from the command line, so 'make -j' is still useful.)
.NOTPARALLEL:

### Zenith Rust bits
#
# The 'postgres_ffi' depends on the Postgres headers.
zenith: postgres-headers
	cargo build

### PostgreSQL parts
tmp_install/build/config.status:
	+@echo "Configuring postgres build"
	mkdir -p tmp_install/build
	(cd tmp_install/build && \
	../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
	--enable-cassert \
	--enable-debug \
	--enable-depend \
	$(SECCOMP) \
	--prefix=$(abspath tmp_install) > configure.log)

# nicer alias for running 'configure'
postgres-configure: tmp_install/build/config.status

# Install the PostgreSQL header files into tmp_install/include
postgres-headers: postgres-configure
	+@echo "Installing PostgreSQL headers"
	$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install


# Compile and install PostgreSQL and contrib/zenith
postgres: postgres-configure
	+@echo "Compiling PostgreSQL"
	$(MAKE) -C tmp_install/build MAKELEVEL=0 install
	+@echo "Compiling contrib/zenith"
	$(MAKE) -C tmp_install/build/contrib/zenith install

postgres-clean:
	$(MAKE) -C tmp_install/build MAKELEVEL=0 clean

# This doesn't remove the effects of 'configure'.
clean:
	cd tmp_install/build && ${MAKE} clean
	cargo clean

# This removes everything
distclean:
	rm -rf tmp_install
	cargo clean

.PHONY: postgres-configure postgres postgres-headers zenith
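A few invocations this Makefile supports, as a sketch (the -j level is arbitrary; the targets are the ones defined above):

```sh
make -j8                  # 'all': postgres plus the zenith Rust bits
make postgres             # only the patched PostgreSQL and contrib/zenith
make postgres-headers     # just install the headers postgres_ffi needs
make distclean            # remove tmp_install and all cargo build output
```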
Pipfile.lock (generated, Symbolic link, 1)
@@ -0,0 +1 @@
./test_runner/Pipfile.lock
README.md (159)
@@ -2,43 +2,166 @@

Zenith substitutes PostgreSQL storage layer and redistributes data across a cluster of nodes

## Running local installation

1. Install build dependencies and other useful packages

On Ubuntu or Debian this set of packages should be sufficient to build the code:
```text
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang
```

[Rust] 1.48 or later is also required.

To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.

To run the integration tests (not required to use the code), install
Python (3.6 or higher), and install python3 packages with `pip` (called `pip3` on some systems):
```
pip install pytest psycopg2
```

2. Build zenith and patched postgres
```sh
git clone --recursive https://github.com/libzenith/zenith.git
cd zenith
make -j5
```

3. Start pageserver and postgres on top of it (should be called from repo root):
```sh
# Create repository in .zenith with proper paths to binaries and data
# Later that would be responsibility of a package install script
> ./target/debug/zenith init
<...>
new zenith repository was created in .zenith

# start pageserver
> ./target/debug/zenith start
Starting pageserver at '127.0.0.1:64000' in .zenith
Pageserver started

# start postgres on top on the pageserver
> ./target/debug/zenith pg start main
Starting postgres node at 'host=127.0.0.1 port=55432 user=stas'
waiting for server to start.... done

# check list of running postgres instances
> ./target/debug/zenith pg list
BRANCH  ADDRESS          LSN        STATUS
main    127.0.0.1:55432  0/1609610  running
```

4. Now it is possible to connect to postgres and run some queries:
```text
> psql -p55432 -h 127.0.0.1 postgres
postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE
postgres=# insert into t values(1,1);
INSERT 0 1
postgres=# select * from t;
 key | value
-----+-------
   1 | 1
(1 row)
```

5. And create branches and run postgres on them:
```sh
# create branch named migration_check
> ./target/debug/zenith branch migration_check main
Created branch 'migration_check' at 0/1609610

# check branches tree
> ./target/debug/zenith branch
 main
 ┗━ @0/1609610: migration_check

# start postgres on that branch
> ./target/debug/zenith pg start migration_check
Starting postgres node at 'host=127.0.0.1 port=55433 user=stas'
waiting for server to start.... done

# this new postgres instance will have all the data from 'main' postgres,
# but all modifications would not affect data in original postgres
> psql -p55433 -h 127.0.0.1 postgres
postgres=# select * from t;
 key | value
-----+-------
   1 | 1
(1 row)

postgres=# insert into t values(2,2);
INSERT 0 1
```

## Running tests

```sh
git clone --recursive https://github.com/libzenith/zenith.git
./pgbuild.sh # builds postgres and installs it to ./tmp_install
cargo test -- --test-threads=1
make # builds also postgres and installs it to ./tmp_install
cd test_runner
pytest
```

## Documentation

Now we use README files to cover design ideas and overall architecture for each module.
And rustdoc style documentation comments.

To view your documentation in a browser, try running `cargo doc --no-deps --open`

## Source tree layout

/walkeeper:
`/control_plane`:

WAL safekeeper. Written in Rust.
Local control plane.
Functions to start, configure and stop pageserver and postgres instances running as a local processes.
Intended to be used in integration tests and in CLI tools for local installations.

/pageserver:
`/zenith`

Main entry point for the 'zenith' CLI utility.
TODO: Doesn't it belong to control_plane?

`/postgres_ffi`:

Utility functions for interacting with PostgreSQL file formats.
Misc constants, copied from PostgreSQL headers.

`/zenith_utils`:

Helpers that are shared between other crates in this repository.

`/walkeeper`:

WAL safekeeper (also known as WAL acceptor). Written in Rust.

`/pageserver`:

Page Server. Written in Rust.

Depends on the modified 'postgres' binary for WAL redo.

/integration_tests:

Tests with different combinations of a Postgres compute node, WAL safekeeper and Page Server.

/mgmt-console:

Web UI to launch (modified) Postgres servers, using S3 as the backing store. Written in Python.
This is somewhat outdated, as it doesn't use the WAL safekeeper or Page Servers.

/vendor/postgres:
`/vendor/postgres`:

PostgreSQL source tree, with the modifications needed for Zenith.

/vendor/postgres/src/bin/safekeeper:
`/vendor/postgres/contrib/zenith`:

Extension (safekeeper_proxy) that runs in the compute node, and connects to the WAL safekeepers
and streams the WAL
PostgreSQL extension that implements storage manager API and network communications with remote page server.

`/test_runner`:

Integration tests, written in Python using the `pytest` framework.

`test_runner/zenith_regress`:

Quick way to add new SQL regression test to integration tests set.

`/integration_tests`:

Another pack of integration tests. Written in Rust.

[Rust]: https://www.rust-lang.org/learn/get-started
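Step 1 of the installation notes above offers pointing PATH and LD_LIBRARY_PATH at tmp_install instead of installing postgresql-client. A minimal sketch, assuming it is run from the repository root after `make`:

```sh
export PATH="$(pwd)/tmp_install/bin:$PATH"
export LD_LIBRARY_PATH="$(pwd)/tmp_install/lib:$LD_LIBRARY_PATH"
# The psql built into tmp_install is now first on the PATH.
psql -p55432 -h 127.0.0.1 postgres -c 'select 1'
```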
188
cli-v2-story.md
Normal file
188
cli-v2-story.md
Normal file
@@ -0,0 +1,188 @@
Create a new Zenith repository in the current directory:

~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli init
The files belonging to this database system will be owned by user "heikki".
This user must also own the server process.

The database cluster will be initialized with locale "en_GB.UTF-8".
The default database encoding has accordingly been set to "UTF8".
The default text search configuration will be set to "english".

Data page checksums are disabled.

creating directory tmp ... ok
creating subdirectories ... ok
selecting dynamic shared memory implementation ... posix
selecting default max_connections ... 100
selecting default shared_buffers ... 128MB
selecting default time zone ... Europe/Helsinki
creating configuration files ... ok
running bootstrap script ... ok
performing post-bootstrap initialization ... ok
syncing data to disk ... ok

initdb: warning: enabling "trust" authentication for local connections
You can change this by editing pg_hba.conf or using the option -A, or
--auth-local and --auth-host, the next time you run initdb.
new zenith repository was created in .zenith

Initially, there is only one branch:

~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
main

Start a local Postgres instance on the branch:

~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start main
Creating data directory from snapshot at 0/15FFB08...
waiting for server to start....2021-04-13 09:27:43.919 EEST [984664] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv6 address "::1", port 5432
2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv4 address "127.0.0.1", port 5432
2021-04-13 09:27:43.927 EEST [984664] LOG: listening on Unix socket "/tmp/.s.PGSQL.5432"
2021-04-13 09:27:43.939 EEST [984665] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
2021-04-13 09:27:43.939 EEST [984665] LOG: creating missing WAL directory "pg_wal/archive_status"
2021-04-13 09:27:44.189 EEST [984665] LOG: database system was not properly shut down; automatic recovery in progress
2021-04-13 09:27:44.195 EEST [984665] LOG: invalid record length at 0/15FFB80: wanted 24, got 0
2021-04-13 09:27:44.195 EEST [984665] LOG: redo is not required
2021-04-13 09:27:44.225 EEST [984664] LOG: database system is ready to accept connections
done
server started

Run some commands against it:

~/git-sandbox/zenith (cli-v2)$ psql postgres -c "create table foo (t text);"
CREATE TABLE
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "insert into foo values ('inserted on the main branch');"
INSERT 0 1
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "select * from foo"
t
-----------------------------
inserted on the main branch
(1 row)

Create a new branch called 'experimental'. We create it from the
current end of the 'main' branch, but you could specify a different
LSN as the start point instead.

~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch experimental main
branching at end of WAL: 0/161F478

~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
experimental
main

Start another Postgres instance off the 'experimental' branch:

~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
Creating data directory from snapshot at 0/15FFB08...
waiting for server to start....2021-04-13 09:28:41.874 EEST [984766] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv6 address "::1", port 5433
2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv4 address "127.0.0.1", port 5433
2021-04-13 09:28:41.883 EEST [984766] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433"
2021-04-13 09:28:41.896 EEST [984767] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
2021-04-13 09:28:42.265 EEST [984767] LOG: database system was not properly shut down; automatic recovery in progress
2021-04-13 09:28:42.269 EEST [984767] LOG: redo starts at 0/15FFB80
2021-04-13 09:28:42.272 EEST [984767] LOG: invalid record length at 0/161F4B0: wanted 24, got 0
2021-04-13 09:28:42.272 EEST [984767] LOG: redo done at 0/161F478 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
2021-04-13 09:28:42.321 EEST [984766] LOG: database system is ready to accept connections
done
server started

Insert a row on the 'experimental' branch:

~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
t
-----------------------------
inserted on the main branch
(1 row)

~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "insert into foo values ('inserted on experimental')"
INSERT 0 1
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
t
-----------------------------
inserted on the main branch
inserted on experimental
(2 rows)

See that the other Postgres instance is still running on the 'main' branch on port 5432:

~/git-sandbox/zenith (cli-v2)$ psql postgres -p5432 -c "select * from foo"
t
-----------------------------
inserted on the main branch
(1 row)

Everything is stored in the .zenith directory:

~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/
total 12
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 datadirs
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 refs
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 timelines

The 'datadirs' directory contains the datadirs of the running instances:

~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/
total 8
drwx------ 18 heikki heikki 4096 Apr 13 09:27 3c0c634c1674079b2c6d4edf7c91523e
drwx------ 18 heikki heikki 4096 Apr 13 09:28 697e3c103d4b1763cd6e82e4ff361d76
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/3c0c634c1674079b2c6d4edf7c91523e/
total 124
drwxr-xr-x 5 heikki heikki 4096 Apr 13 09:27 base
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 global
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_commit_ts
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_dynshmem
-rw------- 1 heikki heikki 4760 Apr 13 09:27 pg_hba.conf
-rw------- 1 heikki heikki 1636 Apr 13 09:27 pg_ident.conf
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:32 pg_logical
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 pg_multixact
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_notify
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_replslot
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_serial
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_snapshots
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_stat
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:34 pg_stat_tmp
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_subtrans
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_tblspc
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_twophase
-rw------- 1 heikki heikki 3 Apr 13 09:27 PG_VERSION
lrwxrwxrwx 1 heikki heikki 52 Apr 13 09:27 pg_wal -> ../../timelines/3c0c634c1674079b2c6d4edf7c91523e/wal
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_xact
-rw------- 1 heikki heikki 88 Apr 13 09:27 postgresql.auto.conf
-rw------- 1 heikki heikki 28688 Apr 13 09:27 postgresql.conf
-rw------- 1 heikki heikki 96 Apr 13 09:27 postmaster.opts
-rw------- 1 heikki heikki 149 Apr 13 09:27 postmaster.pid

Note how 'pg_wal' is just a symlink to the 'timelines' directory. The
datadir is ephemeral: you can delete it at any time, and it can be reconstructed
from the snapshots and WAL stored in the 'timelines' directory. So if you push/pull
the repository, the 'datadirs' are not included. (They are like git working trees.)

~/git-sandbox/zenith (cli-v2)$ killall -9 postgres
~/git-sandbox/zenith (cli-v2)$ rm -rf .zenith/datadirs/*
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
Creating data directory from snapshot at 0/15FFB08...
waiting for server to start....2021-04-13 09:37:05.476 EEST [985340] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv6 address "::1", port 5433
2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv4 address "127.0.0.1", port 5433
2021-04-13 09:37:05.487 EEST [985340] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433"
2021-04-13 09:37:05.498 EEST [985341] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
2021-04-13 09:37:05.808 EEST [985341] LOG: database system was not properly shut down; automatic recovery in progress
2021-04-13 09:37:05.813 EEST [985341] LOG: redo starts at 0/15FFB80
2021-04-13 09:37:05.815 EEST [985341] LOG: invalid record length at 0/161F770: wanted 24, got 0
2021-04-13 09:37:05.815 EEST [985341] LOG: redo done at 0/161F738 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
2021-04-13 09:37:05.866 EEST [985340] LOG: database system is ready to accept connections
done
server started
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
t
-----------------------------
inserted on the main branch
inserted on experimental
(2 rows)
1 control_plane/.gitignore vendored Normal file
@@ -0,0 +1 @@
tmp_check/
27 control_plane/Cargo.toml Normal file
@@ -0,0 +1,27 @@
[package]
name = "control_plane"
version = "0.1.0"
authors = ["Stas Kelvich <stas@zenith.tech>"]
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
rand = "0.8.3"
tar = "0.4.33"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
toml = "0.5"
lazy_static = "1.4"
regex = "1"
anyhow = "1.0"
bytes = "1.0.1"
nix = "0.20"
url = "2.2.2"

pageserver = { path = "../pageserver" }
walkeeper = { path = "../walkeeper" }
postgres_ffi = { path = "../postgres_ffi" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }
415 control_plane/src/compute.rs Normal file
@@ -0,0 +1,415 @@
|
||||
use std::io::Write;
|
||||
use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::process::Command;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{collections::BTreeMap, path::PathBuf};
|
||||
use std::{
|
||||
fs::{self, OpenOptions},
|
||||
io::Read,
|
||||
};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use zenith_utils::connstring::connection_host_port;
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use pageserver::ZTimelineId;
|
||||
|
||||
use crate::storage::PageServerNode;
|
||||
|
||||
//
|
||||
// ComputeControlPlane
|
||||
//
|
||||
pub struct ComputeControlPlane {
|
||||
base_port: u16,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
pub nodes: BTreeMap<String, Arc<PostgresNode>>,
|
||||
env: LocalEnv,
|
||||
}
|
||||
|
||||
impl ComputeControlPlane {
|
||||
// Load current nodes with ports from data directories on disk
|
||||
pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
|
||||
// TODO: since the pageserver does not have a config file yet, we assume here
// that it is running on the default port. Change this when the pageserver gets a config file.
|
||||
let pageserver = Arc::new(PageServerNode::from_env(&env));
|
||||
|
||||
let pgdatadirspath = &env.pg_data_dirs_path();
|
||||
let nodes: Result<BTreeMap<_, _>> = fs::read_dir(&pgdatadirspath)
|
||||
.with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
|
||||
.into_iter()
|
||||
.map(|f| {
|
||||
PostgresNode::from_dir_entry(f?, &env, &pageserver)
|
||||
.map(|node| (node.name.clone(), Arc::new(node)))
|
||||
})
|
||||
.collect();
|
||||
let nodes = nodes?;
|
||||
|
||||
Ok(ComputeControlPlane {
|
||||
base_port: 55431,
|
||||
pageserver,
|
||||
nodes,
|
||||
env,
|
||||
})
|
||||
}
|
||||
|
||||
fn get_port(&mut self) -> u16 {
|
||||
1 + self
|
||||
.nodes
|
||||
.iter()
|
||||
.map(|(_name, node)| node.address.port())
|
||||
.max()
|
||||
.unwrap_or(self.base_port)
|
||||
}
|
||||
|
||||
pub fn local(local_env: &LocalEnv, pageserver: &Arc<PageServerNode>) -> ComputeControlPlane {
|
||||
ComputeControlPlane {
|
||||
base_port: 65431,
|
||||
pageserver: Arc::clone(pageserver),
|
||||
nodes: BTreeMap::new(),
|
||||
env: local_env.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Connect to a page server, get base backup, and untar it to initialize a
|
||||
/// new data directory
|
||||
pub fn new_from_page_server(
|
||||
&mut self,
|
||||
is_test: bool,
|
||||
timelineid: ZTimelineId,
|
||||
name: &str,
|
||||
) -> Result<Arc<PostgresNode>> {
|
||||
let node = Arc::new(PostgresNode {
|
||||
name: name.to_owned(),
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()),
|
||||
env: self.env.clone(),
|
||||
pageserver: Arc::clone(&self.pageserver),
|
||||
is_test,
|
||||
timelineid,
|
||||
});
|
||||
|
||||
node.init_from_page_server()?;
|
||||
self.nodes.insert(node.name.clone(), Arc::clone(&node));
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
|
||||
pub fn new_node(&mut self, branch_name: &str) -> Result<Arc<PostgresNode>> {
|
||||
let timeline_id = self.pageserver.branch_get_by_name(branch_name)?.timeline_id;
|
||||
|
||||
let node = self.new_from_page_server(false, timeline_id, branch_name)?;
|
||||
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
concat!(
|
||||
"shared_preload_libraries = zenith\n",
|
||||
"synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg?
|
||||
"zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
|
||||
),
|
||||
node.connstr()
|
||||
)
|
||||
.as_str(),
|
||||
)?;
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub struct PostgresNode {
|
||||
pub address: SocketAddr,
|
||||
name: String,
|
||||
pub env: LocalEnv,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
is_test: bool,
|
||||
pub timelineid: ZTimelineId,
|
||||
}
|
||||
|
||||
impl PostgresNode {
|
||||
fn from_dir_entry(
|
||||
entry: std::fs::DirEntry,
|
||||
env: &LocalEnv,
|
||||
pageserver: &Arc<PageServerNode>,
|
||||
) -> Result<PostgresNode> {
|
||||
if !entry.file_type()?.is_dir() {
|
||||
anyhow::bail!(
|
||||
"PostgresNode::from_dir_entry failed: '{}' is not a directory",
|
||||
entry.path().display()
|
||||
);
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref CONF_PORT_RE: Regex = Regex::new(r"(?m)^\s*port\s*=\s*(\d+)\s*$").unwrap();
|
||||
static ref CONF_TIMELINE_RE: Regex =
|
||||
Regex::new(r"(?m)^\s*zenith.zenith_timeline\s*=\s*'(\w+)'\s*$").unwrap();
|
||||
}
|
||||
|
||||
// parse data directory name
|
||||
let fname = entry.file_name();
|
||||
let name = fname.to_str().unwrap().to_string();
|
||||
|
||||
// find out tcp port in config file
|
||||
let cfg_path = entry.path().join("postgresql.conf");
|
||||
let config = fs::read_to_string(cfg_path.clone()).with_context(|| {
|
||||
format!(
|
||||
"failed to read config file in {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
)
|
||||
})?;
|
||||
|
||||
// parse port
|
||||
let err_msg = format!(
|
||||
"failed to find port definition in config file {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
);
|
||||
let port: u16 = CONF_PORT_RE
|
||||
.captures(config.as_str())
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
|
||||
.iter()
|
||||
.last()
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
|
||||
.as_str()
|
||||
.parse()
|
||||
.with_context(|| err_msg)?;
|
||||
|
||||
// parse timeline
|
||||
let err_msg = format!(
|
||||
"failed to find timeline definition in config file {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
);
|
||||
let timelineid: ZTimelineId = CONF_TIMELINE_RE
|
||||
.captures(config.as_str())
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
|
||||
.iter()
|
||||
.last()
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
|
||||
.as_str()
|
||||
.parse()
|
||||
.with_context(|| err_msg)?;
|
||||
|
||||
// ok now
|
||||
Ok(PostgresNode {
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
|
||||
name,
|
||||
env: env.clone(),
|
||||
pageserver: Arc::clone(pageserver),
|
||||
is_test: false,
|
||||
timelineid,
|
||||
})
|
||||
}
|
||||
|
||||
// Connect to a page server, get base backup, and untar it to initialize a
|
||||
// new data directory
|
||||
pub fn init_from_page_server(&self) -> Result<()> {
|
||||
let pgdata = self.pgdata();
|
||||
|
||||
println!(
|
||||
"Extracting base backup to create postgres instance: path={} port={}",
|
||||
pgdata.display(),
|
||||
self.address.port()
|
||||
);
|
||||
|
||||
// initialize data directory
|
||||
if self.is_test {
|
||||
fs::remove_dir_all(&pgdata).ok();
|
||||
}
|
||||
|
||||
let sql = format!("basebackup {}", self.timelineid);
|
||||
let mut client = self
|
||||
.pageserver
|
||||
.page_server_psql_client()
|
||||
.with_context(|| "connecting to page server failed")?;
|
||||
|
||||
fs::create_dir_all(&pgdata)
|
||||
.with_context(|| format!("could not create data directory {}", pgdata.display()))?;
|
||||
fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).with_context(
|
||||
|| {
|
||||
format!(
|
||||
"could not set permissions in data directory {}",
|
||||
pgdata.display()
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
// FIXME: The compute node should be able to stream the WAL it needs from the WAL safekeepers or archive.
|
||||
// But that's not implemented yet. For now, 'pg_wal' is included in the base backup tarball that
|
||||
// we receive from the Page Server, so we don't need to create the empty 'pg_wal' directory here.
|
||||
//fs::create_dir_all(pgdata.join("pg_wal"))?;
|
||||
|
||||
let mut copyreader = client
|
||||
.copy_out(sql.as_str())
|
||||
.with_context(|| "page server 'basebackup' command failed")?;
|
||||
|
||||
// FIXME: Currently, we slurp the whole tarball into memory, and then extract it,
|
||||
// but we really should do this:
|
||||
//let mut ar = tar::Archive::new(copyreader);
|
||||
let mut buf = vec![];
|
||||
copyreader
|
||||
.read_to_end(&mut buf)
|
||||
.with_context(|| "reading base backup from page server failed")?;
|
||||
let mut ar = tar::Archive::new(buf.as_slice());
|
||||
ar.unpack(&pgdata)
|
||||
.with_context(|| "extracting page backup failed")?;
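// A streaming variant of the above, as the FIXME suggests: `copyreader`
// implements `std::io::Read`, so the tarball could be handed to `tar::Archive`
// directly instead of being buffered in memory first. Sketch only, not wired in:
//
//     let mut ar = tar::Archive::new(copyreader);
//     ar.unpack(&pgdata)
//         .with_context(|| "extracting base backup failed")?;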
|
||||
|
||||
// wal_log_hints is mandatory when running against pageserver (see gh issue#192)
|
||||
// TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
|
||||
self.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!(
|
||||
"max_wal_senders = 10\n\
|
||||
wal_log_hints = on\n\
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
fsync = off\n\
|
||||
max_connections = 100\n\
|
||||
wal_sender_timeout = 0\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n",
|
||||
address = self.address.ip(),
|
||||
port = self.address.port()
|
||||
),
|
||||
)?;
|
||||
|
||||
// Never clean up old WAL. TODO: We should use a replication
|
||||
// slot or something proper, to prevent the compute node
|
||||
// from removing WAL that hasn't been streamed to the safekeeper or
|
||||
// page server yet. But this will do for now.
|
||||
self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n")?;
|
||||
|
||||
// Connect it to the page server.
|
||||
|
||||
// Configure that node to take pages from pageserver
|
||||
let (host, port) = connection_host_port(&self.pageserver.connection_config());
|
||||
self.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!(
|
||||
"shared_preload_libraries = zenith \n\
|
||||
zenith.page_server_connstring = 'host={} port={}'\n\
|
||||
zenith.zenith_timeline='{}'\n",
|
||||
host, port, self.timelineid
|
||||
),
|
||||
)?;
|
||||
|
||||
fs::create_dir_all(self.pgdata().join("pg_wal"))?;
|
||||
fs::create_dir_all(self.pgdata().join("pg_wal").join("archive_status"))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn pgdata(&self) -> PathBuf {
|
||||
self.env.pg_data_dir(&self.name)
|
||||
}
|
||||
|
||||
pub fn status(&self) -> &str {
|
||||
let timeout = Duration::from_millis(300);
|
||||
let has_pidfile = self.pgdata().join("postmaster.pid").exists();
|
||||
let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok();
|
||||
|
||||
match (has_pidfile, can_connect) {
|
||||
(true, true) => "running",
|
||||
(false, false) => "stopped",
|
||||
(true, false) => "crashed",
|
||||
(false, true) => "running, no pidfile",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn append_conf(&self, config: &str, opts: &str) -> Result<()> {
|
||||
OpenOptions::new()
|
||||
.append(true)
|
||||
.open(self.pgdata().join(config).to_str().unwrap())?
|
||||
.write_all(opts.as_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn pg_ctl(&self, args: &[&str]) -> Result<()> {
|
||||
let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl");
|
||||
|
||||
let pg_ctl = Command::new(pg_ctl_path)
|
||||
.args(
|
||||
[
|
||||
&[
|
||||
"-D",
|
||||
self.pgdata().to_str().unwrap(),
|
||||
"-l",
|
||||
self.pgdata().join("pg.log").to_str().unwrap(),
|
||||
"-w", //wait till pg_ctl actually does what was asked
|
||||
],
|
||||
args,
|
||||
]
|
||||
.concat(),
|
||||
)
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.with_context(|| "pg_ctl failed")?;
|
||||
if !pg_ctl.success() {
|
||||
anyhow::bail!("pg_ctl failed");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn start(&self) -> Result<()> {
|
||||
println!("Starting postgres node at '{}'", self.connstr());
|
||||
self.pg_ctl(&["start"])
|
||||
}
|
||||
|
||||
pub fn restart(&self) -> Result<()> {
|
||||
self.pg_ctl(&["restart"])
|
||||
}
|
||||
|
||||
pub fn stop(&self, destroy: bool) -> Result<()> {
|
||||
self.pg_ctl(&["-m", "immediate", "stop"])?;
|
||||
if destroy {
|
||||
println!(
|
||||
"Destroying postgres data directory '{}'",
|
||||
self.pgdata().to_str().unwrap()
|
||||
);
|
||||
fs::remove_dir_all(&self.pgdata())?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn connstr(&self) -> String {
|
||||
format!(
|
||||
"host={} port={} user={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
self.whoami()
|
||||
)
|
||||
}
|
||||
|
||||
// XXX: cache that in control plane
|
||||
pub fn whoami(&self) -> String {
|
||||
let output = Command::new("whoami")
|
||||
.output()
|
||||
.expect("failed to execute whoami");
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("whoami failed");
|
||||
}
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PostgresNode {
|
||||
// destructor to clean up state after test is done
|
||||
// XXX: we may detect failed test by setting some flag in catch_unwind()
|
||||
// and checking it here. But let just clean datadirs on start.
|
||||
fn drop(&mut self) {
|
||||
if self.is_test {
|
||||
let _ = self.stop(true);
|
||||
}
|
||||
}
|
||||
}
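// Sketch of the intended call sequence (assumed flow, mirroring what the CLI
// and the cli-v2-story walkthrough do; not part of the original file):
//
//     let env = local_env::load_config()?;
//     let mut cplane = ComputeControlPlane::load(env)?;
//     let node = cplane.new_node("main")?;  // basebackup from the pageserver + WAL streaming config
//     node.start()?;                        // pg_ctl start on the new datadir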
|
||||
31 control_plane/src/lib.rs Normal file
@@ -0,0 +1,31 @@
//
// Local control plane.
//
// Can start, configure and stop postgres instances running as local processes.
//
// Intended to be used in integration tests and in CLI tools for
// local installations.
//
use anyhow::{anyhow, bail, Context, Result};
use std::fs;
use std::path::Path;

pub mod compute;
pub mod local_env;
pub mod storage;

/// Read a PID file
///
/// We expect a file that contains a single integer.
/// We return an i32 for compatibility with libc and nix.
pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
    let pid_str = fs::read_to_string(pidfile)
        .with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
    let pid: i32 = pid_str
        .parse()
        .map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
    if pid < 1 {
        bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
    }
    Ok(pid)
}
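// A minimal usage sketch (assumed caller code, mirroring how storage.rs below
// combines read_pidfile with nix signalling; not part of the original file):
//
//     use nix::sys::signal::{kill, Signal};
//     use nix::unistd::Pid;
//
//     fn stop_daemon(pidfile: &std::path::Path) -> anyhow::Result<()> {
//         let pid = control_plane::read_pidfile(pidfile)?;
//         kill(Pid::from_raw(pid), Signal::SIGTERM)?;
//         Ok(())
//     }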
166 control_plane/src/local_env.rs Normal file
@@ -0,0 +1,166 @@
|
||||
//
|
||||
// This module is responsible for locating and loading paths in a local setup.
|
||||
//
|
||||
// It also provides an init method, which acts as a stub for a proper installation
// script that will use local paths.
|
||||
//
|
||||
use anyhow::{anyhow, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use std::{collections::BTreeMap, env};
|
||||
use url::Url;
|
||||
|
||||
pub type Remotes = BTreeMap<String, String>;
|
||||
|
||||
//
|
||||
// These data structures represent the deserialized zenith CLI config
|
||||
//
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct LocalEnv {
|
||||
// Pageserver connection strings
|
||||
pub pageserver_connstring: String,
|
||||
|
||||
// Base directory for both pageserver and compute nodes
|
||||
pub base_data_dir: PathBuf,
|
||||
|
||||
// Path to the postgres distribution. It's expected that "bin", "include",
// "lib" and "share" from the postgres distribution are there. If at some point
// we are able to run against vanilla postgres, we may split this into four
// separate paths to match an OS-specific installation layout.
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
|
||||
// Path to pageserver binary. Empty for remote pageserver.
|
||||
pub zenith_distrib_dir: Option<PathBuf>,
|
||||
|
||||
pub remotes: Remotes,
|
||||
}
|
||||
|
||||
impl LocalEnv {
|
||||
// postgres installation paths
|
||||
pub fn pg_bin_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("bin")
|
||||
}
|
||||
pub fn pg_lib_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("lib")
|
||||
}
|
||||
|
||||
pub fn pageserver_bin(&self) -> Result<PathBuf> {
|
||||
Ok(self
|
||||
.zenith_distrib_dir
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("Can not manage remote pageserver"))?
|
||||
.join("pageserver"))
|
||||
}
|
||||
|
||||
pub fn pg_data_dirs_path(&self) -> PathBuf {
|
||||
self.base_data_dir.join("pgdatadirs")
|
||||
}
|
||||
|
||||
pub fn pg_data_dir(&self, name: &str) -> PathBuf {
|
||||
self.pg_data_dirs_path().join(name)
|
||||
}
|
||||
|
||||
// TODO: move pageserver files into ./pageserver
|
||||
pub fn pageserver_data_dir(&self) -> PathBuf {
|
||||
self.base_data_dir.clone()
|
||||
}
|
||||
}
|
||||
|
||||
fn base_path() -> PathBuf {
|
||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
||||
Some(val) => PathBuf::from(val.to_str().unwrap()),
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
}
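// For example (hypothetical path), running the CLI as
//
//     ZENITH_REPO_DIR=/srv/zenith ./target/debug/cli branch
//
// makes it operate on /srv/zenith instead of the default ./.zenith.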
|
||||
|
||||
//
|
||||
// Initialize a new Zenith repository
|
||||
//
|
||||
pub fn init(remote_pageserver: Option<&str>) -> Result<()> {
|
||||
// check if config already exists
|
||||
let base_path = base_path();
|
||||
if base_path.exists() {
|
||||
anyhow::bail!(
|
||||
"{} already exists. Perhaps already initialized?",
|
||||
base_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// ok, now check that expected binaries are present
|
||||
|
||||
// Find postgres binaries. Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
|
||||
let pg_distrib_dir: PathBuf = {
|
||||
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
|
||||
postgres_bin.into()
|
||||
} else {
|
||||
let cwd = env::current_dir()?;
|
||||
cwd.join("tmp_install")
|
||||
}
|
||||
};
|
||||
if !pg_distrib_dir.join("bin/postgres").exists() {
|
||||
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
|
||||
}
|
||||
|
||||
fs::create_dir(&base_path)?;
|
||||
fs::create_dir(base_path.join("pgdatadirs"))?;
|
||||
|
||||
let conf = if let Some(addr) = remote_pageserver {
|
||||
// check that addr is parsable
|
||||
let _uri = Url::parse(addr).map_err(|e| anyhow!("{}: {}", addr, e))?;
|
||||
|
||||
LocalEnv {
|
||||
pageserver_connstring: format!("postgresql://{}/", addr),
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir: None,
|
||||
base_data_dir: base_path,
|
||||
remotes: BTreeMap::default(),
|
||||
}
|
||||
} else {
|
||||
// Find zenith binaries.
|
||||
let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
if !zenith_distrib_dir.join("pageserver").exists() {
|
||||
anyhow::bail!("Can't find pageserver binary.",);
|
||||
}
|
||||
|
||||
LocalEnv {
|
||||
pageserver_connstring: "postgresql://127.0.0.1:6400".to_string(),
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir: Some(zenith_distrib_dir),
|
||||
base_data_dir: base_path,
|
||||
remotes: BTreeMap::default(),
|
||||
}
|
||||
};
|
||||
|
||||
let toml = toml::to_string_pretty(&conf)?;
|
||||
fs::write(conf.base_data_dir.join("config"), toml)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Locate and load config
|
||||
pub fn load_config() -> Result<LocalEnv> {
|
||||
let repopath = base_path();
|
||||
|
||||
if !repopath.exists() {
|
||||
anyhow::bail!(
|
||||
"Zenith config is not found in {}. You need to run 'zenith init' first",
|
||||
repopath.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// TODO: check that it looks like a zenith repository
|
||||
|
||||
// load and parse file
|
||||
let config = fs::read_to_string(repopath.join("config"))?;
|
||||
toml::from_str(config.as_str()).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
// Save config. We use that to change set of remotes from CLI itself.
|
||||
pub fn save_config(conf: &LocalEnv) -> Result<()> {
|
||||
let config_path = base_path().join("config");
|
||||
let conf_str = toml::to_string_pretty(conf)?;
|
||||
|
||||
fs::write(config_path, conf_str)?;
|
||||
Ok(())
|
||||
}
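// Sketch of the intended round trip (assumed caller code; "origin" and the URL
// are made-up examples):
//
//     let mut env = local_env::load_config()?;            // reads .zenith/config
//     env.remotes
//         .insert("origin".to_string(), "postgresql://example.com:6400/".to_string());
//     local_env::save_config(&env)?;                       // writes it back as TOML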
|
||||
215 control_plane/src/storage.rs Normal file
@@ -0,0 +1,215 @@
|
||||
use std::collections::HashMap;
|
||||
use std::net::TcpStream;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use postgres::{Config, NoTls};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::read_pidfile;
|
||||
use pageserver::branches::BranchInfo;
|
||||
use zenith_utils::connstring::connection_address;
|
||||
|
||||
//
|
||||
// Control routines for pageserver.
|
||||
//
|
||||
// Used in CLI and tests.
|
||||
//
|
||||
pub struct PageServerNode {
|
||||
pub kill_on_exit: bool,
|
||||
pub connection_config: Option<Config>,
|
||||
pub env: LocalEnv,
|
||||
}
|
||||
|
||||
impl PageServerNode {
|
||||
pub fn from_env(env: &LocalEnv) -> PageServerNode {
|
||||
PageServerNode {
|
||||
kill_on_exit: false,
|
||||
connection_config: None, // default
|
||||
env: env.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
fn default_config() -> Config {
|
||||
"postgresql://no_user@localhost:64000/no_db"
|
||||
.parse()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn connection_config(&self) -> Config {
|
||||
match &self.connection_config {
|
||||
Some(config) => config.clone(),
|
||||
None => Self::default_config(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn init(&self) -> Result<()> {
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
let status = cmd
|
||||
.args(&[
|
||||
"--init",
|
||||
"-D",
|
||||
self.env.base_data_dir.to_str().unwrap(),
|
||||
"--postgres-distrib",
|
||||
self.env.pg_distrib_dir.to_str().unwrap(),
|
||||
])
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1")
|
||||
.status()
|
||||
.expect("pageserver init failed");
|
||||
|
||||
if status.success() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(anyhow!("pageserver init failed"))
|
||||
}
|
||||
}
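// For reference, this spawns roughly the following command (actual paths come
// from the local environment):
//
//     pageserver --init -D <base_data_dir> --postgres-distrib <pg_distrib_dir>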
|
||||
|
||||
pub fn repo_path(&self) -> PathBuf {
|
||||
self.env.pageserver_data_dir()
|
||||
}
|
||||
|
||||
pub fn pid_file(&self) -> PathBuf {
|
||||
self.repo_path().join("pageserver.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> Result<()> {
|
||||
println!(
|
||||
"Starting pageserver at '{}' in {}",
|
||||
connection_address(&self.connection_config()),
|
||||
self.repo_path().display()
|
||||
);
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
cmd.args(&["-D", self.repo_path().to_str().unwrap()])
|
||||
.arg("-d")
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1");
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Pageserver failed to start. See '{}' for details.",
|
||||
self.repo_path().join("pageserver.log").display()
|
||||
);
|
||||
}
|
||||
|
||||
// It takes a while for the page server to start up. Wait until it is
|
||||
// open for business.
|
||||
for retries in 1..15 {
|
||||
let client = self.page_server_psql_client();
|
||||
if client.is_ok() {
|
||||
break;
|
||||
} else {
|
||||
println!("Pageserver not responding yet, retrying ({})...", retries);
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
|
||||
println!("Pageserver started");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stop(&self) -> Result<()> {
|
||||
let pid = read_pidfile(&self.pid_file())?;
|
||||
let pid = Pid::from_raw(pid);
|
||||
if kill(pid, Signal::SIGTERM).is_err() {
|
||||
bail!("Failed to kill pageserver with pid {}", pid);
|
||||
}
|
||||
|
||||
// wait for pageserver stop
|
||||
let address = connection_address(&self.connection_config());
|
||||
for _ in 0..5 {
|
||||
let stream = TcpStream::connect(&address);
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
if let Err(_e) = stream {
|
||||
println!("Pageserver stopped");
|
||||
return Ok(());
|
||||
}
|
||||
println!("Stopping pageserver on {}", address);
|
||||
}
|
||||
|
||||
bail!("Failed to stop pageserver with pid {}", pid);
|
||||
}
|
||||
|
||||
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
|
||||
let mut client = self.connection_config().connect(NoTls).unwrap();
|
||||
|
||||
println!("Pageserver query: '{}'", sql);
|
||||
client.simple_query(sql).unwrap()
|
||||
}
|
||||
|
||||
pub fn page_server_psql_client(&self) -> Result<postgres::Client, postgres::Error> {
|
||||
self.connection_config().connect(NoTls)
|
||||
}
|
||||
|
||||
pub fn branches_list(&self) -> Result<Vec<BranchInfo>> {
|
||||
let mut client = self.page_server_psql_client()?;
|
||||
let query_result = client.simple_query("branch_list")?;
|
||||
let branches_json = query_result
|
||||
.first()
|
||||
.map(|msg| match msg {
|
||||
postgres::SimpleQueryMessage::Row(row) => row.get(0),
|
||||
_ => None,
|
||||
})
|
||||
.flatten()
|
||||
.ok_or_else(|| anyhow!("missing branches"))?;
|
||||
|
||||
let res: Vec<BranchInfo> = serde_json::from_str(branches_json)?;
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
pub fn branch_create(&self, name: &str, startpoint: &str) -> Result<BranchInfo> {
|
||||
let mut client = self.page_server_psql_client()?;
|
||||
let query_result =
|
||||
client.simple_query(format!("branch_create {} {}", name, startpoint).as_str())?;
|
||||
|
||||
let branch_json = query_result
|
||||
.first()
|
||||
.map(|msg| match msg {
|
||||
postgres::SimpleQueryMessage::Row(row) => row.get(0),
|
||||
_ => None,
|
||||
})
|
||||
.flatten()
|
||||
.ok_or_else(|| anyhow!("missing branch"))?;
|
||||
|
||||
let res: BranchInfo = serde_json::from_str(branch_json).map_err(|e| {
|
||||
anyhow!(
|
||||
"failed to parse branch_create response: {}: {}",
|
||||
branch_json,
|
||||
e
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
// TODO: make this a separate request type and avoid loading all the branches
|
||||
pub fn branch_get_by_name(&self, name: &str) -> Result<BranchInfo> {
|
||||
let branch_infos = self.branches_list()?;
|
||||
let branche_by_name: Result<HashMap<String, BranchInfo>> = branch_infos
|
||||
.into_iter()
|
||||
.map(|branch_info| Ok((branch_info.name.clone(), branch_info)))
|
||||
.collect();
|
||||
let branche_by_name = branche_by_name?;
|
||||
|
||||
let branch = branche_by_name
|
||||
.get(name)
|
||||
.ok_or_else(|| anyhow!("Branch {} not found", name))?;
|
||||
|
||||
Ok(branch.clone())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageServerNode {
|
||||
fn drop(&mut self) {
|
||||
if self.kill_on_exit {
|
||||
let _ = self.stop();
|
||||
}
|
||||
}
|
||||
}
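// Sketch of how a CLI command might drive this node (assumed flow, pieced
// together from the methods above; not part of the original file):
//
//     let env = local_env::load_config()?;
//     let pageserver = PageServerNode::from_env(&env);
//     pageserver.start()?;
//     let branch = pageserver.branch_create("experimental", "main")?;
//     println!("created branch '{}'", branch.name);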
|
||||
11 docker-entrypoint.sh Executable file
@@ -0,0 +1,11 @@
#!/bin/sh
if [ "$1" = 'pageserver' ]; then
    if [ ! -d "/data/timelines" ]; then
        echo "Initializing pageserver data directory"
        pageserver --init -D /data --postgres-distrib /usr/local
    fi
    echo "Starting pageserver at 0.0.0.0:6400"
    pageserver -l 0.0.0.0:6400 -D /data
else
    "$@"
fi
@@ -1,16 +0,0 @@
[package]
name = "integration_tests"
version = "0.1.0"
authors = ["Stas Kelvich <stas@zenith.tech>"]
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
lazy_static = "1.4.0"
rand = "0.8.3"
postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
tokio-postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }

pageserver = { path = "../pageserver" }
walkeeper = { path = "../walkeeper" }
@@ -1,663 +0,0 @@
|
||||
//
|
||||
// Local control plane.
|
||||
//
|
||||
// Can start, configure and stop postgres instances running as local processes.
|
||||
//
|
||||
// Intended to be used in integration tests and in CLI tools for
|
||||
// local installations.
|
||||
//
|
||||
|
||||
use std::fs::File;
|
||||
use std::fs::{self, OpenOptions};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
use std::{
|
||||
io::Write,
|
||||
net::{IpAddr, Ipv4Addr, SocketAddr},
|
||||
};
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
lazy_static! {
|
||||
// postgres would be there if it was build by 'make postgres' here in the repo
|
||||
pub static ref PG_BIN_DIR : PathBuf = Path::new(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("../tmp_install/bin");
|
||||
pub static ref PG_LIB_DIR : PathBuf = Path::new(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("../tmp_install/lib");
|
||||
|
||||
pub static ref BIN_DIR : PathBuf = cargo_bin_dir();
|
||||
|
||||
pub static ref TEST_WORKDIR : PathBuf = Path::new(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tmp_check");
|
||||
}
|
||||
|
||||
// Find the directory where the binaries were put (i.e. target/debug/)
|
||||
pub fn cargo_bin_dir() -> PathBuf {
|
||||
let mut pathbuf = std::env::current_exe().ok().unwrap();
|
||||
|
||||
pathbuf.pop();
|
||||
if pathbuf.ends_with("deps") {
|
||||
pathbuf.pop();
|
||||
}
|
||||
|
||||
return pathbuf;
|
||||
}
|
||||
|
||||
//
|
||||
// I'm intentionally modelling the storage and compute control planes as separate entities,
// as that is closer to the actual setup.
|
||||
//
|
||||
pub struct StorageControlPlane {
|
||||
pub wal_acceptors: Vec<WalAcceptorNode>,
|
||||
pub page_servers: Vec<PageServerNode>,
|
||||
}
|
||||
|
||||
impl StorageControlPlane {
|
||||
// postgres <-> page_server
|
||||
pub fn one_page_server() -> StorageControlPlane {
|
||||
let mut cplane = StorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
page_servers: Vec::new(),
|
||||
};
|
||||
|
||||
let pserver = PageServerNode {
|
||||
page_service_addr: "127.0.0.1:65200".parse().unwrap(),
|
||||
data_dir: TEST_WORKDIR.join("pageserver"),
|
||||
};
|
||||
pserver.init();
|
||||
pserver.start();
|
||||
|
||||
cplane.page_servers.push(pserver);
|
||||
cplane
|
||||
}
|
||||
|
||||
pub fn fault_tolerant(redundancy: usize) -> StorageControlPlane {
|
||||
let mut cplane = StorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
page_servers: Vec::new(),
|
||||
};
|
||||
const WAL_ACCEPTOR_PORT: usize = 54321;
|
||||
|
||||
for i in 0..redundancy {
|
||||
let wal_acceptor = WalAcceptorNode {
|
||||
listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i)
|
||||
.parse()
|
||||
.unwrap(),
|
||||
data_dir: TEST_WORKDIR.join(format!("wal_acceptor_{}", i)),
|
||||
};
|
||||
wal_acceptor.init();
|
||||
wal_acceptor.start();
|
||||
cplane.wal_acceptors.push(wal_acceptor);
|
||||
}
|
||||
cplane
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
for wa in self.wal_acceptors.iter() {
|
||||
wa.stop();
|
||||
}
|
||||
}
|
||||
|
||||
// // postgres <-> wal_acceptor x3 <-> page_server
|
||||
// fn local(&mut self) -> StorageControlPlane {
|
||||
// }
|
||||
|
||||
pub fn page_server_addr(&self) -> &SocketAddr {
|
||||
&self.page_servers[0].page_service_addr
|
||||
}
|
||||
|
||||
pub fn get_wal_acceptor_conn_info(&self) -> String {
|
||||
self.wal_acceptors
|
||||
.iter()
|
||||
.map(|wa| wa.listen.to_string().to_string())
|
||||
.collect::<Vec<String>>()
|
||||
.join(",")
|
||||
}
|
||||
|
||||
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
|
||||
let addr = &self.page_servers[0].page_service_addr;
|
||||
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
addr.ip(),
|
||||
addr.port(),
|
||||
"no_db",
|
||||
"no_user",
|
||||
);
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Pageserver query: '{}'", sql);
|
||||
client.simple_query(sql).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for StorageControlPlane {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PageServerNode {
|
||||
page_service_addr: SocketAddr,
|
||||
data_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl PageServerNode {
|
||||
// TODO: method to force redo on a specific relation
|
||||
|
||||
// TODO: make wal-redo-postgres workable without data directory?
|
||||
pub fn init(&self) {
|
||||
fs::create_dir_all(self.data_dir.clone()).unwrap();
|
||||
|
||||
let datadir_path = self.data_dir.join("wal_redo_pgdata");
|
||||
fs::remove_dir_all(datadir_path.to_str().unwrap()).ok();
|
||||
|
||||
let initdb = Command::new(PG_BIN_DIR.join("initdb"))
|
||||
.args(&["-D", datadir_path.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute initdb");
|
||||
if !initdb.success() {
|
||||
panic!("initdb failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
println!("Starting pageserver at '{}'", self.page_service_addr);
|
||||
|
||||
let status = Command::new(BIN_DIR.join("pageserver"))
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.page_service_addr.to_string().as_str()])
|
||||
.arg("-d")
|
||||
.arg("--skip-recovery")
|
||||
.env_clear()
|
||||
.env("PATH", PG_BIN_DIR.to_str().unwrap()) // path to the postgres wal-redo binary
|
||||
.status()
|
||||
.expect("failed to start pageserver");
|
||||
|
||||
if !status.success() {
|
||||
panic!("pageserver start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
let pidfile = self.data_dir.join("pageserver.pid");
|
||||
let pid = fs::read_to_string(pidfile).unwrap();
|
||||
let status = Command::new("kill")
|
||||
.arg(pid)
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
if !status.success() {
|
||||
panic!("kill start failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageServerNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
// fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WalAcceptorNode {
|
||||
listen: SocketAddr,
|
||||
data_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl WalAcceptorNode {
|
||||
pub fn init(&self) {
|
||||
if self.data_dir.exists() {
|
||||
fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
fs::create_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
println!(
|
||||
"Starting wal_acceptor in {} listening '{}'",
|
||||
self.data_dir.to_str().unwrap(),
|
||||
self.listen
|
||||
);
|
||||
|
||||
let status = Command::new(BIN_DIR.join("wal_acceptor"))
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.listen.to_string().as_str()])
|
||||
.arg("-d")
|
||||
.arg("-n")
|
||||
.status()
|
||||
.expect("failed to start wal_acceptor");
|
||||
|
||||
if !status.success() {
|
||||
panic!("wal_acceptor start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
let pidfile = self.data_dir.join("wal_acceptor.pid");
|
||||
if let Ok(pid) = fs::read_to_string(pidfile) {
|
||||
let _status = Command::new("kill")
|
||||
.arg(pid)
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalAcceptorNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
// fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// ComputeControlPlane
|
||||
//
|
||||
pub struct ComputeControlPlane<'a> {
|
||||
pg_bin_dir: PathBuf,
|
||||
work_dir: PathBuf,
|
||||
last_assigned_port: u16,
|
||||
storage_cplane: &'a StorageControlPlane,
|
||||
nodes: Vec<Arc<PostgresNode>>,
|
||||
}
|
||||
|
||||
impl ComputeControlPlane<'_> {
|
||||
pub fn local(storage_cplane: &StorageControlPlane) -> ComputeControlPlane {
|
||||
ComputeControlPlane {
|
||||
pg_bin_dir: PG_BIN_DIR.to_path_buf(),
|
||||
work_dir: TEST_WORKDIR.to_path_buf(),
|
||||
last_assigned_port: 65431,
|
||||
storage_cplane: storage_cplane,
|
||||
nodes: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: check port availability and
|
||||
fn get_port(&mut self) -> u16 {
|
||||
let port = self.last_assigned_port + 1;
|
||||
self.last_assigned_port += 1;
|
||||
port
|
||||
}
|
||||
|
||||
pub fn new_vanilla_node<'a>(&mut self) -> &Arc<PostgresNode> {
|
||||
// allocate new node entry with generated port
|
||||
let node_id = self.nodes.len() + 1;
|
||||
let node = PostgresNode {
|
||||
_node_id: node_id,
|
||||
port: self.get_port(),
|
||||
ip: IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)),
|
||||
pgdata: self.work_dir.join(format!("compute/pg{}", node_id)),
|
||||
pg_bin_dir: self.pg_bin_dir.clone(),
|
||||
};
|
||||
self.nodes.push(Arc::new(node));
|
||||
let node = self.nodes.last().unwrap();
|
||||
|
||||
// initialize data directory
|
||||
fs::remove_dir_all(node.pgdata.to_str().unwrap()).ok();
|
||||
let initdb_path = self.pg_bin_dir.join("initdb");
|
||||
println!("initdb_path: {}", initdb_path.to_str().unwrap());
|
||||
let initdb = Command::new(initdb_path)
|
||||
.args(&["-D", node.pgdata.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute initdb");
|
||||
|
||||
if !initdb.success() {
|
||||
panic!("initdb failed");
|
||||
}
|
||||
|
||||
// // allow local replication connections
|
||||
// node.append_conf("pg_hba.conf", format!("\
|
||||
// host replication all {}/32 sspi include_realm=1 map=regress\n\
|
||||
// ", node.ip).as_str());
|
||||
|
||||
// listen for selected port
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
max_wal_senders = 10\n\
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
max_connections = 100\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n\
|
||||
",
|
||||
address = node.ip,
|
||||
port = node.port
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
node
|
||||
}
|
||||
|
||||
// Init compute node without files, only datadir structure
|
||||
// use initdb --compute-node flag and GUC 'computenode_mode'
|
||||
// to distinguish the node
|
||||
pub fn new_minimal_node(&mut self) -> &PostgresNode {
|
||||
// allocate new node entry with generated port
|
||||
let node_id = self.nodes.len() + 1;
|
||||
let node = PostgresNode {
|
||||
_node_id: node_id,
|
||||
port: self.get_port(),
|
||||
ip: IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)),
|
||||
pgdata: self.work_dir.join(format!("compute/pg{}", node_id)),
|
||||
pg_bin_dir: self.pg_bin_dir.clone(),
|
||||
};
|
||||
self.nodes.push(Arc::new(node));
|
||||
let node = self.nodes.last().unwrap();
|
||||
|
||||
// initialize data directory w/o files
|
||||
fs::remove_dir_all(node.pgdata.to_str().unwrap()).ok();
|
||||
let initdb_path = self.pg_bin_dir.join("initdb");
|
||||
println!("initdb_path: {}", initdb_path.to_str().unwrap());
|
||||
let initdb = Command::new(initdb_path)
|
||||
.args(&["-D", node.pgdata.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.arg("--no-instructions")
|
||||
.arg("--compute-node")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute initdb");
|
||||
|
||||
if !initdb.success() {
|
||||
panic!("initdb failed");
|
||||
}
|
||||
|
||||
// listen for selected port
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
max_wal_senders = 10\n\
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
max_connections = 100\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n\
|
||||
computenode_mode = true\n\
|
||||
",
|
||||
address = node.ip,
|
||||
port = node.port
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
node
|
||||
}
|
||||
|
||||
pub fn new_node(&mut self) -> Arc<PostgresNode> {
|
||||
let storage_cplane = self.storage_cplane;
|
||||
let node = self.new_vanilla_node();
|
||||
|
||||
let pserver = storage_cplane.page_server_addr();
|
||||
|
||||
// Configure that node to take pages from pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
page_server_connstring = 'host={} port={}'\n\
|
||||
",
|
||||
pserver.ip(),
|
||||
pserver.port()
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
node.clone()
|
||||
}
|
||||
|
||||
pub fn new_master_node(&mut self) -> Arc<PostgresNode> {
|
||||
let node = self.new_vanilla_node();
|
||||
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
"synchronous_standby_names = 'safekeeper_proxy'\n\
|
||||
",
|
||||
);
|
||||
node.clone()
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub struct WalProposerNode {
|
||||
pid: u32,
|
||||
}
|
||||
|
||||
impl WalProposerNode {
|
||||
pub fn stop(&self) {
|
||||
let status = Command::new("kill")
|
||||
.arg(self.pid.to_string())
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
if !status.success() {
|
||||
panic!("kill start failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalProposerNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub struct PostgresNode {
|
||||
_node_id: usize,
|
||||
pub port: u16,
|
||||
pub ip: IpAddr,
|
||||
pgdata: PathBuf,
|
||||
pg_bin_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl PostgresNode {
|
||||
pub fn append_conf(&self, config: &str, opts: &str) {
|
||||
OpenOptions::new()
|
||||
.append(true)
|
||||
.open(self.pgdata.join(config).to_str().unwrap())
|
||||
.unwrap()
|
||||
.write_all(opts.as_bytes())
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
fn pg_ctl(&self, args: &[&str], check_ok: bool) {
|
||||
let pg_ctl_path = self.pg_bin_dir.join("pg_ctl");
|
||||
let pg_ctl = Command::new(pg_ctl_path)
|
||||
.args(
|
||||
[
|
||||
&[
|
||||
"-D",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
"-l",
|
||||
self.pgdata.join("log").to_str().unwrap(),
|
||||
],
|
||||
args,
|
||||
]
|
||||
.concat(),
|
||||
)
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute pg_ctl");
|
||||
|
||||
if check_ok && !pg_ctl.success() {
|
||||
panic!("pg_ctl failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(&self, storage_cplane: &StorageControlPlane) {
|
||||
if storage_cplane.page_servers.len() != 0 {
|
||||
let _res =
|
||||
storage_cplane.page_server_psql(format!("callmemaybe {}", self.connstr()).as_str());
|
||||
}
|
||||
println!("Starting postgres node at '{}'", self.connstr());
|
||||
self.pg_ctl(&["start"], true);
|
||||
}
|
||||
|
||||
pub fn restart(&self) {
|
||||
self.pg_ctl(&["restart"], true);
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
self.pg_ctl(&["-m", "immediate", "stop"], true);
|
||||
}
|
||||
|
||||
pub fn connstr(&self) -> String {
|
||||
format!("host={} port={} user={}", self.ip, self.port, self.whoami())
|
||||
}
|
||||
|
||||
// XXX: cache that in control plane
|
||||
pub fn whoami(&self) -> String {
|
||||
let output = Command::new("whoami")
|
||||
.output()
|
||||
.expect("failed to execute whoami");
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("whoami failed");
|
||||
}
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
|
||||
pub fn safe_psql(&self, db: &str, sql: &str) -> Vec<tokio_postgres::Row> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.ip,
|
||||
self.port,
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Running {}", sql);
|
||||
client.query(sql, &[]).unwrap()
|
||||
}
|
||||
|
||||
pub fn open_psql(&self, db: &str) -> Client {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.ip,
|
||||
self.port,
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
Client::connect(connstring.as_str(), NoTls).unwrap()
|
||||
}
|
||||
|
||||
pub fn get_pgdata(&self) -> Option<&str> {
|
||||
self.pgdata.to_str()
|
||||
}
|
||||
|
||||
/* Create stub controlfile and respective xlog to start computenode */
|
||||
pub fn setup_controlfile(&self) {
|
||||
let filepath = format!("{}/global/pg_control", self.pgdata.to_str().unwrap());
|
||||
|
||||
{
|
||||
File::create(filepath).unwrap();
|
||||
}
|
||||
|
||||
let pg_resetwal_path = self.pg_bin_dir.join("pg_resetwal");
|
||||
|
||||
let pg_resetwal = Command::new(pg_resetwal_path)
|
||||
.args(&["-D", self.pgdata.to_str().unwrap()])
|
||||
.arg("-f")
|
||||
// TODO probably we will have to modify pg_resetwal
|
||||
// .arg("--compute-node")
|
||||
.status()
|
||||
.expect("failed to execute pg_resetwal");
|
||||
|
||||
if !pg_resetwal.success() {
|
||||
panic!("pg_resetwal failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_proxy(&self, wal_acceptors: String) -> WalProposerNode {
|
||||
let proxy_path = PG_BIN_DIR.join("safekeeper_proxy");
|
||||
match Command::new(proxy_path.as_path())
|
||||
.args(&["-s", &wal_acceptors])
|
||||
.args(&["-h", &self.ip.to_string()])
|
||||
.args(&["-p", &self.port.to_string()])
|
||||
.arg("-v")
|
||||
.stderr(File::create(TEST_WORKDIR.join("safepkeeper_proxy.log")).unwrap())
|
||||
.spawn()
|
||||
{
|
||||
Ok(child) => WalProposerNode { pid: child.id() },
|
||||
Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e),
|
||||
}
|
||||
}
|
||||
|
||||
// TODO
|
||||
pub fn pg_bench() {}
|
||||
pub fn pg_regress() {}
|
||||
}
|
||||
|
||||
impl Drop for PostgresNode {
|
||||
// destructor to clean up state after test is done
|
||||
// XXX: we may detect failed test by setting some flag in catch_unwind()
|
||||
// and checking it here. But let just clean datadirs on start.
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
// fs::remove_dir_all(self.pgdata.clone()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn regress_check(pg: &PostgresNode) {
|
||||
pg.safe_psql("postgres", "CREATE DATABASE regression");
|
||||
|
||||
let regress_run_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tmp_check/regress");
|
||||
fs::create_dir_all(regress_run_path.clone()).unwrap();
|
||||
std::env::set_current_dir(regress_run_path).unwrap();
|
||||
|
||||
let regress_build_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
|
||||
let regress_src_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
|
||||
|
||||
let _regress_check = Command::new(regress_build_path.join("pg_regress"))
|
||||
.args(&[
|
||||
"--bindir=''",
|
||||
"--use-existing",
|
||||
format!("--bindir={}", PG_BIN_DIR.to_str().unwrap()).as_str(),
|
||||
format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
|
||||
format!(
|
||||
"--schedule={}",
|
||||
regress_src_path.join("parallel_schedule").to_str().unwrap()
|
||||
)
|
||||
.as_str(),
|
||||
format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
|
||||
])
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.env("PGPORT", pg.port.to_string())
|
||||
.env("PGUSER", pg.whoami())
|
||||
.env("PGHOST", pg.ip.to_string())
|
||||
.status()
|
||||
.expect("pg_regress failed");
|
||||
}
|
||||
@@ -1,7 +0,0 @@
// test node resettlement to an empty datadir
#[test]
fn test_resettlement() {}

// test seq scan of everything after restart
#[test]
fn test_cold_seqscan() {}
@@ -1,5 +0,0 @@
#[test]
fn test_actions() {}

#[test]
fn test_regress() {}
@@ -1,110 +0,0 @@
|
||||
#[allow(dead_code)]
|
||||
mod control_plane;
|
||||
|
||||
use control_plane::ComputeControlPlane;
|
||||
use control_plane::StorageControlPlane;
|
||||
|
||||
// XXX: force all redo at the end
|
||||
// -- restart + seqscan won't read deleted stuff
|
||||
// -- pageserver api endpoint to check all rels
|
||||
|
||||
// Handcrafted cases with wal records that are (were) problematic for redo.
|
||||
#[test]
|
||||
fn test_redo_cases() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = StorageControlPlane::one_page_server();
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
// start postgres
|
||||
let node = compute_cplane.new_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
);
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
|
||||
// check 'create table as'
|
||||
node.safe_psql("postgres", "CREATE TABLE t2 AS SELECT * FROM t");
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
}
|
||||
|
||||
// Runs pg_regress on a compute node
|
||||
#[test]
|
||||
fn test_regress() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = StorageControlPlane::one_page_server();
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
// start postgres
|
||||
let node = compute_cplane.new_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
control_plane::regress_check(&node);
|
||||
}
|
||||
|
||||
// Run two postgres instances on one pageserver
|
||||
#[test]
|
||||
fn test_pageserver_multitenancy() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = StorageControlPlane::one_page_server();
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
// Allocate postgres instance, but don't start
|
||||
let node1 = compute_cplane.new_node();
|
||||
let node2 = compute_cplane.new_node();
|
||||
node1.start(&storage_cplane);
|
||||
node2.start(&storage_cplane);
|
||||
|
||||
// check node1
|
||||
node1.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node1.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
);
|
||||
let count: i64 = node1
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
|
||||
// check node2
|
||||
node2.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node2.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(100000,200000), 'payload'",
|
||||
);
|
||||
let count: i64 = node2
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 15000150000);
|
||||
}
|
||||
@@ -1,222 +0,0 @@
|
||||
// Restart acceptors one by one while compute is under load.
|
||||
#[allow(dead_code)]
|
||||
mod control_plane;
|
||||
use control_plane::ComputeControlPlane;
|
||||
use control_plane::StorageControlPlane;
|
||||
|
||||
use rand::Rng;
|
||||
use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
use std::{thread, time};
|
||||
|
||||
#[test]
|
||||
fn test_acceptors_normal_work() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
);
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
// check wal files equality
|
||||
}
|
||||
|
||||
// Majority is always alive
|
||||
#[test]
|
||||
fn test_acceptors_restarts() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
const FAULT_PROBABILITY: f32 = 0.01;
|
||||
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// start postgres
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
let mut failed_node: Option<usize> = None;
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
let mut psql = node.open_psql("postgres");
|
||||
for i in 1..=1000 {
|
||||
psql.execute("INSERT INTO t values ($1, 'payload')", &[&i])
|
||||
.unwrap();
|
||||
let prob: f32 = rng.gen();
|
||||
if prob <= FAULT_PROBABILITY {
|
||||
if let Some(node) = failed_node {
|
||||
storage_cplane.wal_acceptors[node].start();
|
||||
failed_node = None;
|
||||
} else {
|
||||
let node: usize = rng.gen_range(0..REDUNDANCY);
|
||||
failed_node = Some(node);
|
||||
storage_cplane.wal_acceptors[node].stop();
|
||||
}
|
||||
}
|
||||
}
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 500500);
|
||||
}
|
||||
|
||||
fn start_acceptor(cplane: &Arc<StorageControlPlane>, no: usize) {
|
||||
let cp = cplane.clone();
|
||||
thread::spawn(move || {
|
||||
thread::sleep(time::Duration::from_secs(1));
|
||||
cp.wal_acceptors[no].start();
|
||||
});
|
||||
}
|
||||
|
||||
// Stop the majority of acceptors while compute is under load. Boot
// them again and check that nothing was lost. Repeat.
|
||||
// N_CRASHES env var
|
||||
#[test]
|
||||
fn test_acceptors_unavalability() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 2;
|
||||
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
let mut psql = node.open_psql("postgres");
|
||||
psql.execute("INSERT INTO t values (1, 'payload')", &[])
|
||||
.unwrap();
|
||||
|
||||
storage_cplane.wal_acceptors[0].stop();
|
||||
let cp = Arc::new(storage_cplane);
|
||||
start_acceptor(&cp, 0);
|
||||
let now = SystemTime::now();
|
||||
psql.execute("INSERT INTO t values (2, 'payload')", &[])
|
||||
.unwrap();
|
||||
assert!(now.elapsed().unwrap().as_secs() > 1);
|
||||
psql.execute("INSERT INTO t values (3, 'payload')", &[])
|
||||
.unwrap();
|
||||
|
||||
cp.wal_acceptors[1].stop();
|
||||
start_acceptor(&cp, 1);
|
||||
psql.execute("INSERT INTO t values (4, 'payload')", &[])
|
||||
.unwrap();
|
||||
assert!(now.elapsed().unwrap().as_secs() > 2);
|
||||
|
||||
psql.execute("INSERT INTO t values (5, 'payload')", &[])
|
||||
.unwrap();
|
||||
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 15);
|
||||
}
|
||||
|
||||
fn simulate_failures(cplane: &Arc<StorageControlPlane>) {
|
||||
let mut rng = rand::thread_rng();
|
||||
let n_acceptors = cplane.wal_acceptors.len();
|
||||
let failure_period = time::Duration::from_secs(1);
|
||||
loop {
|
||||
thread::sleep(failure_period);
|
||||
let mask: u32 = rng.gen_range(0..(1 << n_acceptors));
|
||||
for i in 0..n_acceptors {
|
||||
if (mask & (1 << i)) != 0 {
|
||||
cplane.wal_acceptors[i].stop();
|
||||
}
|
||||
}
|
||||
thread::sleep(failure_period);
|
||||
for i in 0..n_acceptors {
|
||||
if (mask & (1 << i)) != 0 {
|
||||
cplane.wal_acceptors[i].start();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Race condition test
|
||||
#[test]
|
||||
fn test_race_conditions() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
let cplane = Arc::new(storage_cplane);
|
||||
let cp = cplane.clone();
|
||||
thread::spawn(move || {
|
||||
simulate_failures(&cp);
|
||||
});
|
||||
|
||||
let mut psql = node.open_psql("postgres");
|
||||
for i in 1..=1000 {
|
||||
psql.execute("INSERT INTO t values ($1, 'payload')", &[&i])
|
||||
.unwrap();
|
||||
}
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 500500);
|
||||
cplane.stop();
|
||||
}
|
||||
mgmt-console/.gitignore (vendored, 23 lines)
@@ -1,23 +0,0 @@
|
||||
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
|
||||
|
||||
# dependencies
|
||||
/node_modules
|
||||
/.pnp
|
||||
.pnp.js
|
||||
|
||||
# testing
|
||||
/coverage
|
||||
|
||||
# production
|
||||
/build
|
||||
|
||||
# misc
|
||||
.DS_Store
|
||||
.env.local
|
||||
.env.development.local
|
||||
.env.test.local
|
||||
.env.production.local
|
||||
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
@@ -1,55 +0,0 @@
|
||||
Mock implementation of a management console.
|
||||
|
||||
See demo-howto.txt for usage.
|
||||
|
||||
Building and Installation
|
||||
-------------------------
|
||||
|
||||
To compile Postgres:
|
||||
sudo apt build-dep postgresql
|
||||
sudo apt install bison flex libz-dev libssl-dev
|
||||
sudo apt install ccache
|
||||
sudo apt install libcurl4-openssl-dev libxml2-dev
|
||||
|
||||
For the webapp:
|
||||
# NOTE: This requires at least version 1.1.0 of python3-flask. That's not
|
||||
# available in Debian Buster, need at least Bullseye.
|
||||
|
||||
sudo apt install python3 python3-flask python3-pip npm webpack
|
||||
pip3 install Flask-BasicAuth
|
||||
pip3 install boto3
|
||||
|
||||
Clone, compile, and install the patched version of Postgres:
|
||||
|
||||
git clone https://github.com/libzenith/postgres.git
|
||||
cd postgres
|
||||
git checkout zenith-experiments
|
||||
./configure --enable-debug --enable-cassert --with-openssl --prefix=/home/heikki/pgsql-install --with-libxml CC="ccache gcc" CFLAGS="-O0"
|
||||
make -j4 -s install
|
||||
|
||||
Get the webapp:
|
||||
cd ~
|
||||
git clone https://github.com/libzenith/zenith-mgmt-console.git
|
||||
cd zenith-mgmt-console
|
||||
mkdir pgdatadirs
|
||||
|
||||
|
||||
Create a self-signed server certificate:
openssl req -new -x509 -days 365 -nodes -text -out server.crt \
  -keyout server.key -subj "/CN=zenith-demo"
|
||||
|
||||
For Mock S3 server (unless you want to test against a real cloud service):
|
||||
sudo apt install python3-tornado
|
||||
|
||||
cd ~/zenith-mgmt-console
|
||||
git clone https://github.com/hlinnaka/ms3.git
|
||||
|
||||
Compile & run it:
|
||||
npm install
|
||||
webpack # compile React app
|
||||
|
||||
BASIC_AUTH_PASSWORD=<password> ./launch-local.sh
|
||||
|
||||
|
||||
You can view the contents of the S3 bucket with browser:
|
||||
|
||||
http://<server>/list_bucket
|
||||
@@ -1,340 +0,0 @@
|
||||
from flask import request
|
||||
from flask_basicauth import BasicAuth
|
||||
from flask import render_template
|
||||
from subprocess import PIPE, STDOUT, run, Popen
|
||||
import html
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import logging
|
||||
import time
|
||||
|
||||
import boto3
|
||||
from boto3.session import Session
|
||||
from botocore.client import Config
|
||||
from botocore.handlers import set_list_objects_encoding_type_url
|
||||
|
||||
from flask import Flask
|
||||
|
||||
import waldump
|
||||
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
app.config['BASIC_AUTH_USERNAME'] = 'zenith'
|
||||
app.config['BASIC_AUTH_PASSWORD'] = os.getenv('BASIC_AUTH_PASSWORD')
|
||||
app.config['BASIC_AUTH_FORCE'] = True
|
||||
|
||||
basic_auth = BasicAuth(app)
|
||||
|
||||
# S3 configuration:
|
||||
|
||||
ENDPOINT = os.getenv('S3_ENDPOINT', 'https://localhost:9000')
|
||||
ACCESS_KEY = os.getenv('S3_ACCESSKEY', 'minioadmin')
|
||||
SECRET = os.getenv('S3_SECRET', '')
|
||||
BUCKET = os.getenv('S3_BUCKET', 'foobucket')
|
||||
|
||||
print("Using bucket at " + ENDPOINT);
|
||||
|
||||
#boto3.set_stream_logger('botocore', logging.DEBUG)
|
||||
|
||||
session = Session(aws_access_key_id=ACCESS_KEY,
|
||||
aws_secret_access_key=SECRET,
|
||||
region_name=os.getenv('S3_REGION', 'auto'))
|
||||
|
||||
# needed for google cloud?
|
||||
session.events.unregister('before-parameter-build.s3.ListObjects',
|
||||
set_list_objects_encoding_type_url)
|
||||
|
||||
s3resource = session.resource('s3',
|
||||
endpoint_url=ENDPOINT,
|
||||
verify=False,
|
||||
config=Config(signature_version='s3v4'))
|
||||
s3bucket = s3resource.Bucket(BUCKET)
|
||||
|
||||
s3_client = boto3.client('s3',
|
||||
endpoint_url=ENDPOINT,
|
||||
verify=False,
|
||||
config=Config(signature_version='s3v4'),
|
||||
aws_access_key_id=ACCESS_KEY,
|
||||
aws_secret_access_key=SECRET)
|
||||
|
||||
|
||||
@app.route("/")
|
||||
def index():
|
||||
return render_template("index.html")
|
||||
|
||||
|
||||
@app.route("/api/waldump")
|
||||
def render_waldump():
|
||||
return render_template("waldump.html")
|
||||
|
||||
@app.route('/api/fetch_wal')
|
||||
def fetch_wal():
|
||||
return waldump.fetch_wal(request, s3bucket);
|
||||
|
||||
@app.route("/api/server_status")
|
||||
def server_status():
|
||||
dirs = os.listdir("pgdatadirs")
|
||||
dirs.sort()
|
||||
|
||||
primary = None
|
||||
standbys = []
|
||||
|
||||
for dirname in dirs:
|
||||
|
||||
result = run("pg_ctl status -D pgdatadirs/" + dirname, stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
|
||||
srv = {
|
||||
'datadir': dirname,
|
||||
'status': result.stdout,
|
||||
'port': None
|
||||
}
|
||||
|
||||
if dirname == 'primary':
|
||||
primary = srv;
|
||||
primary['port'] = 5432;
|
||||
else:
|
||||
standby_match = re.search('standby_([0-9]+)', dirname)
|
||||
if standby_match:
|
||||
srv['port'] = int(standby_match.group(1))
|
||||
|
||||
standbys.append(srv);
|
||||
|
||||
return {'primary': primary, 'standbys': standbys}
|
||||
|
||||
@app.route('/api/list_bucket')
|
||||
def list_bucket():
|
||||
|
||||
response = 'cloud bucket contents:<br>\n'
|
||||
|
||||
for file in s3bucket.objects.all():
|
||||
response = response + html.escape(file.key) + '<br>\n'
|
||||
|
||||
return response
|
||||
|
||||
def walpos_str(walpos):
|
||||
return '{:X}/{:X}'.format(walpos >> 32, walpos & 0xFFFFFFFF)
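# Illustrative note (not in the original file): walpos_str() renders a 64-bit
# WAL position in PostgreSQL's usual "hi/lo" hex notation, for example:
#   walpos_str((1 << 32) + 0x28) == '1/28'
#   walpos_str(0x3000000) == '0/3000000'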
|
||||
|
||||
@app.route('/api/bucket_summary')
|
||||
def bucket_summary():
|
||||
|
||||
nonrelimages = []
|
||||
minwal = int(0)
|
||||
maxwal = int(0)
|
||||
minseqwal = int(0)
|
||||
maxseqwal = int(0)
|
||||
|
||||
for file in s3bucket.objects.all():
|
||||
path = file.key
|
||||
match = re.search('nonreldata/nonrel_([0-9A-F]+).tar', path)
|
||||
if match:
|
||||
walpos = int(match.group(1), 16)
|
||||
nonrelimages.append(walpos_str(walpos))
|
||||
|
||||
match = re.search('nonreldata/nonrel_([0-9A-F]+)-([0-9A-F]+)', path)
|
||||
if match:
|
||||
endwal = int(match.group(2), 16)
|
||||
if endwal > maxwal:
|
||||
maxwal = endwal
|
||||
|
||||
match = re.search('walarchive/([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})', path)
|
||||
if match:
|
||||
tli = int(match.group(1), 16)
|
||||
logno = int(match.group(2), 16)
|
||||
segno = int(match.group(3), 16)
|
||||
# FIXME: this assumes default 16 MB wal segment size
|
||||
logsegno = logno * (0x100000000 / (16*1024*1024)) + segno
|
||||
|
||||
seqwal = int((logsegno + 1) * (16*1024*1024))
|
||||
|
||||
if seqwal > maxseqwal:
|
||||
maxseqwal = seqwal;
|
||||
if minseqwal == 0 or seqwal < minseqwal:
|
||||
minseqwal = seqwal;
|
||||
|
||||
return {
|
||||
'nonrelimages': nonrelimages,
|
||||
'minwal': walpos_str(minwal),
|
||||
'maxwal': walpos_str(maxwal),
|
||||
'minseqwal': walpos_str(minseqwal),
|
||||
'maxseqwal': walpos_str(maxseqwal)
|
||||
}
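
# A minimal standalone sketch, added for illustration (it is not part of the
# original file): the WAL-archive file-name arithmetic used in bucket_summary()
# above, assuming the default 16 MB WAL segment size noted in the FIXME there.
def walfile_end_lsn(filename):
    # File names look like '000000010000000000000002': 8 hex digits each for
    # the timeline, the "log" number (high 32 bits of the LSN), and the
    # segment number within that log.
    logno = int(filename[8:16], 16)
    segno = int(filename[16:24], 16)
    wal_seg_size = 16 * 1024 * 1024
    logsegno = logno * (0x100000000 // wal_seg_size) + segno
    end = (logsegno + 1) * wal_seg_size   # LSN just past the end of this segment
    return walpos_str(end)

# Example: walfile_end_lsn('000000010000000000000002') == '0/3000000'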
|
||||
|
||||
def print_cmd_result(cmd_result):
|
||||
return print_cmd_result_ex(cmd_result.args, cmd_result.returncode, cmd_result.stdout)
|
||||
|
||||
def print_cmd_result_ex(cmd, returncode, stdout):
|
||||
res = ''
|
||||
res += 'ran command:\n' + str(cmd) + '\n'
|
||||
res += 'It returned code ' + str(returncode) + '\n'
|
||||
res += '\n'
|
||||
res += 'stdout/stderr:\n'
|
||||
res += stdout
|
||||
|
||||
return res
|
||||
|
||||
@app.route('/api/init_primary', methods=['GET', 'POST'])
|
||||
def init_primary():
|
||||
|
||||
initdb_result = run("initdb -D pgdatadirs/primary --username=zenith --pwfile=pg-password.txt", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
if initdb_result.returncode != 0:
|
||||
return print_cmd_result(initdb_result)
|
||||
|
||||
# Append archive_mode and archive_command and port to postgresql.conf
|
||||
f=open("pgdatadirs/primary/postgresql.conf", "a+")
|
||||
f.write("listen_addresses='*'\n")
|
||||
f.write("archive_mode=on\n")
|
||||
f.write("archive_command='zenith_push --archive-wal-path=%p --archive-wal-fname=%f'\n")
|
||||
f.write("ssl=on\n")
|
||||
f.close()
|
||||
|
||||
f=open("pgdatadirs/primary/pg_hba.conf", "a+")
|
||||
f.write("# allow SSL connections with password from anywhere\n")
|
||||
f.write("hostssl all all 0.0.0.0/0 md5\n")
|
||||
f.write("hostssl all all ::0/0 md5\n")
|
||||
f.close()
|
||||
|
||||
shutil.copyfile("server.crt", "pgdatadirs/primary/server.crt")
|
||||
shutil.copyfile("server.key", "pgdatadirs/primary/server.key")
|
||||
os.chmod("pgdatadirs/primary/server.key", 0o0600)
|
||||
|
||||
start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
start_rc = start_proc.wait()
|
||||
start_stdout, start_stderr = start_proc.communicate()
|
||||
|
||||
responsestr = print_cmd_result(initdb_result) + '\n'
|
||||
responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/zenith_push', methods=['GET', 'POST'])
|
||||
def zenith_push():
|
||||
# Stop the primary if it's running
|
||||
stop_result = run(args=["pg_ctl", "stop", "-D", "pgdatadirs/primary"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
|
||||
# Call zenith_push
|
||||
push_result = run("zenith_push -D pgdatadirs/primary", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
|
||||
# Restart the primary
|
||||
start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
start_rc = start_proc.wait()
|
||||
start_stdout, start_stderr = start_proc.communicate()
|
||||
|
||||
responsestr = print_cmd_result(stop_result) + '\n'
|
||||
responsestr += print_cmd_result(push_result) + '\n'
|
||||
responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout) + '\n'
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/create_standby', methods=['GET', 'POST'])
|
||||
def create_standby():
|
||||
|
||||
walpos = request.form.get('walpos')
|
||||
if not walpos:
|
||||
return 'no walpos'
|
||||
|
||||
dirs = os.listdir("pgdatadirs")
|
||||
|
||||
last_port = 5432
|
||||
|
||||
for dirname in dirs:
|
||||
|
||||
standby_match = re.search('standby_([0-9]+)', dirname)
|
||||
if standby_match:
|
||||
port = int(standby_match.group(1))
|
||||
if port > last_port:
|
||||
last_port = port
|
||||
|
||||
standby_port = last_port + 1
|
||||
|
||||
standby_dir = "pgdatadirs/standby_" + str(standby_port)
|
||||
|
||||
# Call zenith_restore
|
||||
restore_result = run(["zenith_restore", "--end=" + walpos, "-D", standby_dir], stdout=PIPE, stderr=STDOUT, encoding='latin1')
|
||||
responsestr = print_cmd_result(restore_result)
|
||||
|
||||
if restore_result.returncode == 0:
|
||||
# Append hot_standby and port to postgresql.conf
|
||||
f=open(standby_dir + "/postgresql.conf", "a+")
|
||||
f.write("hot_standby=on\n")
|
||||
f.write("port=" + str(standby_port) + "\n")
|
||||
f.close()
|
||||
|
||||
start_proc = Popen(args=["pg_ctl", "start", "-D", standby_dir, "-l", standby_dir + "/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
start_rc = start_proc.wait()
|
||||
start_stdout, start_stderr = start_proc.communicate()
|
||||
responsestr += '\n\n' + print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/destroy_server', methods=['GET', 'POST'])
|
||||
def destroy_primary():
|
||||
|
||||
datadir = request.form.get('datadir')
|
||||
|
||||
# Check that the datadir parameter doesn't contain anything funny.
|
||||
if not re.match("^[A-Za-z0-9_-]+$", datadir):
|
||||
raise Exception('invalid datadir: ' + datadir)
|
||||
|
||||
# Stop the server if it's running
|
||||
stop_result = run(args=["pg_ctl", "stop", "-m", "immediate", "-D", "pgdatadirs/" + datadir], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
|
||||
shutil.rmtree('pgdatadirs/' + datadir, ignore_errors=True)
|
||||
|
||||
responsestr = print_cmd_result(stop_result) + '\n'
|
||||
responsestr += 'Deleted datadir ' + datadir + '.\n'
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/restore_primary', methods=['GET', 'POST'])
|
||||
def restore_primary():
|
||||
|
||||
# Call zenith_restore
|
||||
restore_result = run(["zenith_restore", "-D", "pgdatadirs/primary"], stdout=PIPE, stderr=STDOUT, encoding='latin1')
|
||||
responsestr = print_cmd_result(restore_result)
|
||||
|
||||
# Append restore_command to postgresql.conf, so that it can find the last raw WAL segments
|
||||
f=open("pgdatadirs/primary/postgresql.conf", "a+")
|
||||
f.write("listen_addresses='*'\n")
|
||||
f.write("restore_command='zenith_restore --archive-wal-path=%p --archive-wal-fname=%f'\n")
|
||||
f.write("ssl=on\n")
|
||||
f.close()
|
||||
|
||||
if restore_result.returncode == 0:
|
||||
start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
start_rc = start_proc.wait()
|
||||
start_stdout, start_stderr = start_proc.communicate()
|
||||
responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/slicedice', methods=['GET', 'POST'])
|
||||
def run_slicedice():
|
||||
result = run("zenith_slicedice", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
|
||||
responsestr = print_cmd_result(result)
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/reset_demo', methods=['POST'])
|
||||
def reset_all():
|
||||
result = run("pkill -9 postgres", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
|
||||
dirs = os.listdir("pgdatadirs")
|
||||
for dirname in dirs:
|
||||
shutil.rmtree('pgdatadirs/' + dirname)
|
||||
|
||||
for file in s3bucket.objects.all():
|
||||
s3_client.delete_object(Bucket = BUCKET, Key = file.key)
|
||||
|
||||
responsestr = print_cmd_result(result) + '\n'
|
||||
responsestr += '''
|
||||
Deleted all Postgres datadirs.
|
||||
Deleted all files in object storage bucket.
|
||||
'''
|
||||
|
||||
return responsestr
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run()
|
||||
@@ -1,3 +0,0 @@
|
||||
module.exports = {
|
||||
presets: ["@babel/preset-env", "@babel/preset-react"],
|
||||
};
|
||||
@@ -1,67 +0,0 @@
|
||||
Mock implementation of a management console.
|
||||
|
||||
This isn't very different from a "normal" PostgreSQL installation with
|
||||
a base backup and WAL archive. The main user-visible difference is
|
||||
that when you create a standby server, we don't restore the whole data
|
||||
directory, but only the "non-relation" files. Relation files are
|
||||
restored on demand, when they're accessed the first time. That makes
|
||||
the "create standby" operation is very fast, but with some delay when
|
||||
you connect and start running queries instead. Most visible if you
|
||||
have a large database. (However, see note below about large databases)
|
||||
|
||||
Note: lots of things are broken/unsafe. Things will fail if a table is
|
||||
larger than 1 GB. Or if there are more than 1000 files in the cloud
|
||||
bucket.
|
||||
|
||||
How to use this demo:
|
||||
|
||||
1. If there are any leftovers from previous runs, reset by clicking
|
||||
the RESET DEMO button. This kills and deletes all Postgres servers,
|
||||
and empties the cloud storage bucket
|
||||
|
||||
2. Create primary server by clicking on the "Init primary" button
|
||||
|
||||
3. Push a base image of the primary to cloud storage, by clicking the
|
||||
"push base image" button. (This takes about 30 seconds, be
|
||||
patient)
|
||||
|
||||
4. Connect to primary with psql, and create a test table with a little data.
|
||||
|
||||
psql postgres -p5432 -U zenith -h<host>
|
||||
|
||||
create table mytable (i int4);
|
||||
|
||||
insert into mytable values (1);
|
||||
select pg_switch_wal();
|
||||
|
||||
The Postgres password is the same as for the management console.
|
||||
|
||||
5. Now that there's a new WAL segment in the archive, we can "slice &
dice" it. Click on the "Slice & Dice" button.
|
||||
|
||||
6. Perform more updates on the primary to generate more WAL.
|
||||
|
||||
insert into mytable values (2); select pg_switch_wal();
|
||||
insert into mytable values (3); select pg_switch_wal();
|
||||
insert into mytable values (4); select pg_switch_wal();
|
||||
insert into mytable values (5); select pg_switch_wal();
|
||||
|
||||
7. Slice & Dice the WAL again
|
||||
|
||||
8. Now you can create read-only standby servers at any point in the
|
||||
WAL. Type a WAL position in the text box (or use the slider), and
|
||||
click "Create new standby". The first standby is created at port 5433,
|
||||
the second at port 5434, and so forth.
|
||||
|
||||
9. Connect to the standby with "psql -p 5433". Note that it takes a
|
||||
few seconds until the connection is established. That's because the
|
||||
standby has to restore the basic system catalogs, like pg_database and
|
||||
pg_authid from the backup. After connecting, you can do "\d" to list
|
||||
tables; this will also take a few seconds, as more catalog tables are
|
||||
restored from backup. Subsequent commands will be faster.
|
||||
|
||||
Run queries in the standby:
|
||||
|
||||
select * from mytable;
|
||||
|
||||
the result depends on the LSN that you picked when you created the server.
|
||||
@@ -1,463 +0,0 @@
|
||||
import React, { useState, useEffect } from 'react';
|
||||
import ReactDOM from 'react-dom';
|
||||
import Loader from "react-loader-spinner";
|
||||
import { Router, Route, Link, IndexRoute, hashHistory, browserHistory } from 'react-router';
|
||||
|
||||
function ServerStatus(props) {
|
||||
const datadir = props.server.datadir;
|
||||
const status = props.server.status;
|
||||
const port = props.server.port;
|
||||
|
||||
return (
|
||||
<div>
|
||||
<h2>{ datadir == 'primary' ? 'Primary' : datadir }</h2>
|
||||
status: <div className='status'>{status}</div><br/>
|
||||
to connect: <span className='shellcommand'>psql -h { window.location.hostname } -p { port } -U zenith postgres</span><br/>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function StandbyList(props) {
|
||||
const bucketSummary = props.bucketSummary;
|
||||
const standbys = props.standbys;
|
||||
const maxwalpos = bucketSummary.maxwal ? walpos_to_int(bucketSummary.maxwal) : 0;
|
||||
|
||||
const [walposInput, setWalposInput] = useState({ src: 'text', value: '0/0'});
|
||||
|
||||
// find earliest base image
|
||||
const minwalpos = bucketSummary.nonrelimages ? bucketSummary.nonrelimages.reduce((minpos, imgpos_str, index, array) => {
|
||||
const imgpos = walpos_to_int(imgpos_str);
|
||||
return (minpos == 0 || imgpos < minpos) ? imgpos : minpos;
|
||||
}, 0) : 0;
|
||||
|
||||
const can_create_standby = minwalpos > 0 && maxwalpos > 0 && maxwalpos >= minwalpos;
|
||||
var walpos_valid = true;
|
||||
|
||||
function create_standby() {
|
||||
const formdata = new FormData();
|
||||
formdata.append("walpos", walposStr);
|
||||
|
||||
props.startOperation('Creating new standby at ' + walposStr + '...',
|
||||
fetch("/api/create_standby", { method: 'POST', body: formdata }));
|
||||
}
|
||||
|
||||
function destroy_standby(datadir) {
|
||||
const formdata = new FormData();
|
||||
formdata.append("datadir", datadir);
|
||||
props.startOperation('Destroying ' + datadir + '...',
|
||||
fetch("/api/destroy_server", { method: 'POST', body: formdata }));
|
||||
}
|
||||
|
||||
const handleSliderChange = (event) => {
|
||||
setWalposInput({ src: 'slider', value: event.target.value });
|
||||
}
|
||||
|
||||
const handleWalposChange = (event) => {
|
||||
setWalposInput({ src: 'text', value: event.target.value });
|
||||
}
|
||||
|
||||
var sliderValue;
|
||||
var walposStr;
|
||||
if (walposInput.src == 'text')
|
||||
{
|
||||
const walpos = walpos_to_int(walposInput.value);
|
||||
|
||||
if (walpos >= minwalpos && walpos <= maxwalpos)
|
||||
walpos_valid = true;
|
||||
else
|
||||
walpos_valid = false;
|
||||
|
||||
sliderValue = Math.round((walpos - minwalpos) / (maxwalpos - minwalpos) * 100);
|
||||
walposStr = walposInput.value;
|
||||
}
|
||||
else
|
||||
{
|
||||
const slider = walposInput.value;
|
||||
const new_walpos = minwalpos + slider / 100 * (maxwalpos - minwalpos);
|
||||
|
||||
console.log('minwalpos: '+ minwalpos);
|
||||
console.log('maxwalpos: '+ maxwalpos);
|
||||
|
||||
walposStr = int_to_walpos(Math.round(new_walpos));
|
||||
walpos_valid = true;
|
||||
console.log(walposStr);
|
||||
}
|
||||
|
||||
var standbystatus = ''
|
||||
if (standbys)
|
||||
{
|
||||
standbystatus =
|
||||
<div>
|
||||
{
|
||||
standbys.length > 0 ?
|
||||
standbys.map((server) =>
|
||||
<>
|
||||
<ServerStatus key={ 'status_' + server.datadir} server={server}/>
|
||||
<button key={ 'destroy_' + server.datadir} onClick={e => destroy_standby(server.datadir)}>Destroy standby</button>
|
||||
</>
|
||||
) : "no standby servers"
|
||||
}
|
||||
</div>
|
||||
}
|
||||
|
||||
return (
|
||||
<div>
|
||||
<h2>Standbys</h2>
|
||||
<button onClick={create_standby} disabled={!can_create_standby || !walpos_valid}>Create new Standby</button> at LSN
|
||||
<input type="text" id="walpos_input" value={ walposStr } onChange={handleWalposChange} disabled={!can_create_standby}/>
|
||||
<input type="range" id="walpos_slider" min="0" max="100" steps="1" value={sliderValue} onChange={handleSliderChange} disabled={!can_create_standby}/>
|
||||
<br/>
|
||||
{ standbystatus }
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function ServerList(props) {
|
||||
const primary = props.serverStatus ? props.serverStatus.primary : null;
|
||||
const standbys = props.serverStatus ? props.serverStatus.standbys : [];
|
||||
const bucketSummary = props.bucketSummary;
|
||||
|
||||
var primarystatus = '';
|
||||
|
||||
function destroy_primary() {
|
||||
const formdata = new FormData();
|
||||
formdata.append("datadir", 'primary');
|
||||
props.startOperation('Destroying primary...',
|
||||
fetch("/api/destroy_server", { method: 'POST', body: formdata }));
|
||||
}
|
||||
|
||||
function restore_primary() {
|
||||
props.startOperation('Restoring primary...',
|
||||
fetch("/api/restore_primary", { method: 'POST' }));
|
||||
}
|
||||
|
||||
if (primary)
|
||||
{
|
||||
primarystatus =
|
||||
<div>
|
||||
<ServerStatus server={primary}/>
|
||||
<button onClick={destroy_primary}>Destroy primary</button>
|
||||
</div>
|
||||
}
|
||||
else
|
||||
{
|
||||
primarystatus =
|
||||
<div>
|
||||
no primary server<br/>
|
||||
<button onClick={restore_primary}>Restore primary</button>
|
||||
</div>
|
||||
}
|
||||
|
||||
return (
|
||||
<>
|
||||
{ primarystatus }
|
||||
<StandbyList standbys={standbys} startOperation={props.startOperation} bucketSummary={props.bucketSummary}/>
|
||||
<p className="todo">
|
||||
Should we list the WAL safekeeper nodes here? Or are they part of the Storage? Or not visible to users at all?
|
||||
</p>
|
||||
</>
|
||||
);
|
||||
}
|
||||
|
||||
function BucketSummary(props) {
|
||||
const bucketSummary = props.bucketSummary;
|
||||
const startOperation = props.startOperation;
|
||||
|
||||
function slicedice() {
|
||||
startOperation('Slicing sequential WAL to per-relation WAL...',
|
||||
fetch("/api/slicedice", { method: 'POST' }));
|
||||
}
|
||||
|
||||
if (!bucketSummary.nonrelimages)
|
||||
{
|
||||
return <>loading...</>
|
||||
}
|
||||
|
||||
return (
|
||||
<div>
|
||||
<div>Base images at following WAL positions:
|
||||
<ul>
|
||||
{bucketSummary.nonrelimages.map((img) => (
|
||||
<li key={img}>{img}</li>
|
||||
))}
|
||||
</ul>
|
||||
</div>
|
||||
Sliced WAL is available up to { bucketSummary.maxwal }<br/>
|
||||
Raw WAL is available up to { bucketSummary.maxseqwal }<br/>
|
||||
|
||||
<br/>
|
||||
<button onClick={slicedice}>Slice & Dice WAL</button>
|
||||
<p className="todo">
|
||||
Currently, the slicing or "sharding" of the WAL needs to be triggered manually, by clicking the above button.
|
||||
<br/>
|
||||
TODO: make it a continuous process that runs in the WAL safekeepers, or in the Page Servers, or as a standalone service.
|
||||
</p>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function ProgressIndicator()
|
||||
{
|
||||
return (
|
||||
<div>
|
||||
<Loader
|
||||
type="Puff"
|
||||
color="#00BFFF"
|
||||
height={100}
|
||||
width={100}
|
||||
/>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
function walpos_to_int(walpos)
|
||||
{
|
||||
const [hi, lo] = walpos.split('/');
|
||||
|
||||
return parseInt(hi, 16) * 0x100000000 + parseInt(lo, 16); // scale the high half so this inverts int_to_walpos()
|
||||
}
|
||||
|
||||
function int_to_walpos(x)
|
||||
{
|
||||
console.log('converting ' + x);
|
||||
return (Math.floor((x / 0x100000000)).toString(16) + '/' + (x % 0x100000000).toString(16)).toUpperCase();
|
||||
}
|
||||
|
||||
function OperationStatus(props) {
|
||||
const lastOperation = props.lastOperation;
|
||||
const inProgress = props.inProgress;
|
||||
const operationResult = props.operationResult;
|
||||
|
||||
if (lastOperation)
|
||||
{
|
||||
return (
|
||||
<div><h2>Last operation:</h2>
|
||||
<div>{lastOperation} { (!inProgress && lastOperation) ? 'done!' : '' }</div>
|
||||
<div className='result'>
|
||||
{inProgress ? <ProgressIndicator/> : <pre>{operationResult}</pre>}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
else
|
||||
return '';
|
||||
}
|
||||
|
||||
function ActionButtons(props) {
|
||||
|
||||
const startOperation = props.startOperation;
|
||||
const bucketSummary = props.bucketSummary;
|
||||
|
||||
function reset_demo() {
|
||||
startOperation('resetting everything...',
|
||||
fetch("/api/reset_demo", { method: 'POST' }));
|
||||
}
|
||||
|
||||
function init_primary() {
|
||||
startOperation('Initializing new primary...',
|
||||
fetch("/api/init_primary", { method: 'POST' }));
|
||||
}
|
||||
|
||||
function zenith_push() {
|
||||
startOperation('Pushing new base image...',
|
||||
fetch("/api/zenith_push", { method: 'POST' }));
|
||||
}
|
||||
|
||||
return (
|
||||
<div>
|
||||
<p className="todo">
|
||||
RESET DEMO deletes everything in the storage bucket, and stops and destroys all servers. This resets the whole demo environment to the initial state.
|
||||
</p>
|
||||
<button onClick={reset_demo}>RESET DEMO</button>
|
||||
<p className="todo">
|
||||
Init Primary runs initdb to create a new primary server. Click this after Resetting the demo.
|
||||
</p>
|
||||
|
||||
<button onClick={init_primary}>Init primary</button>
|
||||
|
||||
<p className="todo">
|
||||
Push Base Image stops the primary, copies the current state of the primary to the storage bucket as a new base backup, and restarts the primary.
|
||||
<br/>
|
||||
TODO: This should be handled by a continuous background process, probably running in the storage nodes. And without having to shut down the cluster, of course.
|
||||
</p>
|
||||
|
||||
<button onClick={zenith_push}>Push base image</button>
|
||||
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function Sidenav(props)
|
||||
{
|
||||
const toPage = (page) => (event) => {
|
||||
//event.preventDefault()
|
||||
props.switchPage(page);
|
||||
};
|
||||
return (
|
||||
<div>
|
||||
<h3 className="sidenav-item">Menu</h3>
|
||||
<a href="#servers" onClick={toPage('servers')} className="sidenav-item">Servers</a>
|
||||
<a href="#storage" onClick={toPage('storage')} className="sidenav-item">Storage</a>
|
||||
<a href="#snapshots" onClick={toPage('snapshots')} className="sidenav-item">Snapshots</a>
|
||||
<a href="#demo" onClick={toPage('demo')} className="sidenav-item">Demo</a>
|
||||
<a href="#import" onClick={toPage('import')} className="sidenav-item">Import / Export</a>
|
||||
<a href="#jobs" onClick={toPage('jobs')} className="sidenav-item">Jobs</a>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function App()
|
||||
{
|
||||
const [page, setPage] = useState('servers');
|
||||
const [serverStatus, setServerStatus] = useState({});
|
||||
const [bucketSummary, setBucketSummary] = useState({});
|
||||
const [lastOperation, setLastOperation] = useState('');
|
||||
const [inProgress, setInProgress] = useState('');
|
||||
const [operationResult, setOperationResult] = useState('');
|
||||
|
||||
useEffect(() => {
|
||||
reloadStatus();
|
||||
}, []);
|
||||
|
||||
function startOperation(operation, promise)
|
||||
{
|
||||
promise.then(result => result.text()).then(resultText => {
|
||||
operationFinished(resultText);
|
||||
});
|
||||
|
||||
setLastOperation(operation);
|
||||
setInProgress(true);
|
||||
setOperationResult('');
|
||||
}
|
||||
|
||||
function operationFinished(result)
|
||||
{
|
||||
setInProgress(false);
|
||||
setOperationResult(result);
|
||||
reloadStatus();
|
||||
}
|
||||
|
||||
function clearOperation()
|
||||
{
|
||||
setLastOperation('')
|
||||
setInProgress('');
|
||||
setOperationResult('');
|
||||
console.log("cleared");
|
||||
}
|
||||
|
||||
function reloadStatus()
|
||||
{
|
||||
fetch('/api/server_status').then(res => res.json()).then(data => {
|
||||
setServerStatus(data);
|
||||
});
|
||||
|
||||
fetch('/api/bucket_summary').then(res => res.json()).then(data => {
|
||||
setBucketSummary(data);
|
||||
});
|
||||
}
|
||||
|
||||
const content = () => {
|
||||
console.log(page);
|
||||
if (page === 'servers') {
|
||||
return (
|
||||
<>
|
||||
<h1>Server status</h1>
|
||||
<ServerList startOperation={ startOperation }
|
||||
serverStatus={ serverStatus }
|
||||
bucketSummary={ bucketSummary }/>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'storage') {
|
||||
return (
|
||||
<>
|
||||
<h1>Storage bucket status</h1>
|
||||
<BucketSummary startOperation={ startOperation }
|
||||
bucketSummary={ bucketSummary }/>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'snapshots') {
|
||||
return (
|
||||
<>
|
||||
<h1>Snapshots</h1>
|
||||
<p className="todo">
|
||||
In Zenith, snapshots are just specific points (LSNs) in the WAL history, with a label. A snapshot prevents garbage collecting old data that's still needed to reconstruct the database at that LSN.
|
||||
</p>
|
||||
<p className="todo">
|
||||
TODO:
|
||||
<ul>
|
||||
<li>List existing snapshots</li>
|
||||
<li>Create new snapshot manually, from current state or from a given LSN</li>
|
||||
<li>Drill into the WAL stream to see what has happened. Provide tools for e.g. finding the point where a table was dropped</li>
|
||||
<li>Create snapshots automatically based on events in the WAL, for example when pg_create_restore_point() is called in the primary</li>
|
||||
<li>Launch new reader instance at a snapshot</li>
|
||||
<li>Export snapshot</li>
|
||||
<li>Rollback cluster to a snapshot</li>
|
||||
</ul>
|
||||
</p>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'demo') {
|
||||
return (
|
||||
<>
|
||||
<h1>Misc actions</h1>
|
||||
<ActionButtons startOperation={ startOperation }
|
||||
bucketSummary={ bucketSummary }/>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'import') {
|
||||
return (
|
||||
<>
|
||||
<h1>Import & Export tools</h1>
|
||||
<p className="TODO">TODO:
|
||||
<ul>
|
||||
<li>Initialize database from existing backup (pg_basebackup, WAL-G, pgbackrest)</li>
|
||||
<li>Initialize from a pg_dump or other SQL script</li>
|
||||
<li>Launch batch job to import data files from S3</li>
|
||||
<li>Launch batch job to export database with pg_dump to S3</li>
|
||||
</ul>
|
||||
These jobs can be run against reader processing nodes. We can even
spawn a new reader node dedicated to a job, and destroy it when the job is done.
|
||||
</p>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'jobs') {
|
||||
return (
|
||||
<>
|
||||
<h1>Batch jobs</h1>
|
||||
<p className="TODO">TODO:
|
||||
<ul>
|
||||
<li>List running jobs launched from Import & Export tools</li>
|
||||
<li>List other batch jobs launched by the user</li>
|
||||
<li>Launch new batch jobs</li>
|
||||
</ul>
|
||||
</p>
|
||||
</>
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
function switchPage(page)
|
||||
{
|
||||
console.log("topage " + page);
|
||||
setPage(page)
|
||||
clearOperation();
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="row">
|
||||
<div className="sidenav">
|
||||
<Sidenav switchPage={switchPage} className="column"/>
|
||||
</div>
|
||||
<div className="column">
|
||||
<div>
|
||||
{ content() }
|
||||
</div>
|
||||
<OperationStatus lastOperation={ lastOperation }
|
||||
inProgress = { inProgress }
|
||||
operationResult = { operationResult }/>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
ReactDOM.render(<App/>, document.getElementById('reactApp'));
|
||||
@@ -1,105 +0,0 @@
|
||||
import React, { useState, useEffect } from 'react';
|
||||
import ReactDOM from 'react-dom';
|
||||
import Loader from "react-loader-spinner";
|
||||
|
||||
function walpos_to_int(walpos)
|
||||
{
|
||||
const [hi, lo] = walpos.split('/');
|
||||
|
||||
return parseInt(hi, 16) * 0x100000000 + parseInt(lo, 16); // 'hi' is the upper 32 bits of the LSN
|
||||
}
|
||||
|
||||
const palette = [
|
||||
"#003f5c",
|
||||
"#2f4b7c",
|
||||
"#665191",
|
||||
"#a05195",
|
||||
"#d45087",
|
||||
"#f95d6a",
|
||||
"#ff7c43",
|
||||
"#ffa600"];
|
||||
|
||||
function WalRecord(props)
|
||||
{
|
||||
const firstwalpos = props.firstwalpos;
|
||||
const endwalpos = props.endwalpos;
|
||||
const record = props.record;
|
||||
const index = props.index;
|
||||
const xidmap = props.xidmap;
|
||||
|
||||
const startpos = walpos_to_int(record.start)
|
||||
const endpos = walpos_to_int(record.end)
|
||||
|
||||
const scale = 1000 / (16*1024*1024)
|
||||
const startx = (startpos - firstwalpos) * scale;
|
||||
const endx = (endpos - firstwalpos) * scale;
|
||||
|
||||
const xidindex = xidmap[record.xid];
|
||||
const color = palette[index % palette.length];
|
||||
|
||||
const y = 5 + (xidindex) * 20 + (index % 2) * 2;
|
||||
|
||||
return (
|
||||
<line x1={ startx } y1={y} x2={endx} y2={y} stroke={ color } strokeWidth="5">
|
||||
<title>
|
||||
start: { record.start } end: { record.end }
|
||||
</title>
|
||||
</line>
|
||||
)
|
||||
}
|
||||
|
||||
function WalFile(props)
|
||||
{
|
||||
const walContent = props.walContent;
|
||||
const firstwalpos = props.firstwalpos;
|
||||
const xidmap = props.xidmap;
|
||||
|
||||
return <svg width="1000" height="200">
|
||||
{
|
||||
walContent.records ?
|
||||
walContent.records.map((record, index) =>
|
||||
<WalRecord key={record.start} firstwalpos={firstwalpos} record={record} index={index} xidmap={xidmap}/>
|
||||
) : "no records"
|
||||
}
|
||||
</svg>
|
||||
}
|
||||
|
||||
function WalDumpApp()
|
||||
{
|
||||
const [walContent, setWalContent] = useState({});
|
||||
|
||||
const filename = '00000001000000000000000C';
|
||||
|
||||
useEffect(() => {
|
||||
fetch('/fetch_wal?filename='+filename).then(res => res.json()).then(data => {
|
||||
setWalContent(data);
|
||||
});
|
||||
}, []);
|
||||
|
||||
var firstwalpos = 0;
|
||||
var endwalpos = 0;
|
||||
var numxids = 0;
|
||||
var xidmap = {};
|
||||
if (walContent.records && walContent.records.length > 0)
|
||||
{
|
||||
firstwalpos = walpos_to_int(walContent.records[0].start);
|
||||
endwalpos = firstwalpos + 16*1024*1024;
|
||||
|
||||
walContent.records.forEach(rec => {
|
||||
if (!xidmap[rec.xid])
|
||||
{
|
||||
xidmap[rec.xid] = ++numxids;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return (
|
||||
<>
|
||||
<h2>{filename}</h2>
|
||||
<WalFile walContent={walContent} firstwalpos={firstwalpos} endwalpos={endwalpos} xidmap={xidmap}/>
|
||||
</>
|
||||
);
|
||||
}
|
||||
|
||||
console.log('hey there');
|
||||
ReactDOM.render(<WalDumpApp/>, document.getElementById('waldump'));
|
||||
@@ -1,9 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# NOTE: You must set the following environment variables before running this:
|
||||
# BASIC_AUTH_PASSWORD - basic http auth password
|
||||
# S3_ACCESSKEY
|
||||
# S3_SECRET
|
||||
|
||||
|
||||
S3_ENDPOINT=https://storage.googleapis.com S3_BUCKET=zenith-testbucket PATH=/home/heikki/pgsql-install/bin:$PATH flask run --host=0.0.0.0
|
||||
@@ -1,8 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# NOTE: You should set the BASIC_AUTH_PASSWORD environment variable before calling this script.
|
||||
|
||||
# Launch S3 server
|
||||
(cd ms3 && python3 -m ms3.app --listen-address=localhost) &
|
||||
|
||||
FLASK_ENV=development S3_REGION=auto S3_ENDPOINT=http://localhost:9009 S3_BUCKET=zenith-testbucket PATH=/home/heikki/pgsql.fsmfork/bin:$PATH flask run --host=0.0.0.0
|
||||
mgmt-console/package-lock.json (generated, 6144 lines): diff suppressed because it is too large
@@ -1,27 +0,0 @@
|
||||
{
|
||||
"name": "starter-kit",
|
||||
"version": "1.1.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1",
|
||||
"build": "webpack",
|
||||
"start": "python app.py"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"react": "^17.0.1",
|
||||
"react-dom": "^17.0.1",
|
||||
"react-loader-spinner": "^4.0.0",
|
||||
"react-router": "^5.2.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@babel/core": "^7.13.1",
|
||||
"@babel/preset-env": "^7.13.5",
|
||||
"@babel/preset-react": "^7.12.13",
|
||||
"babel-loader": "^8.2.2",
|
||||
"webpack": "^5.24.2",
|
||||
"webpack-cli": "^4.5.0"
|
||||
}
|
||||
}
|
||||
@@ -1,58 +0,0 @@
|
||||
<head>
|
||||
|
||||
<style>
|
||||
.status {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
}
|
||||
.shellcommand {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
}
|
||||
.result {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
padding: 10px;
|
||||
}
|
||||
|
||||
|
||||
.todo {font-style: italic;}
|
||||
|
||||
|
||||
h1 {color: blue;}
|
||||
|
||||
.column {
|
||||
float: left;
|
||||
width: 50%;
|
||||
padding: 10px;
|
||||
}
|
||||
/* Clear floats after the columns */
|
||||
.row:after {
|
||||
content: "";
|
||||
display: table;
|
||||
clear: both;
|
||||
}
|
||||
|
||||
.sidenav {
|
||||
float: left;
|
||||
width: 150px;
|
||||
padding: 10px;
|
||||
background-color: pink;
|
||||
}
|
||||
|
||||
.sidenav-item {
|
||||
padding:10px 0px;
|
||||
border:none;
|
||||
display:block;
|
||||
}
|
||||
|
||||
</style>
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div id="reactApp"></div>
|
||||
|
||||
<!-- Attach React components -->
|
||||
<script type="text/javascript" src="{{ url_for('static', filename='app_bundle.js') }}"></script>
|
||||
</body>
|
||||
@@ -1,46 +0,0 @@
|
||||
<head>
|
||||
|
||||
<style>
|
||||
.status {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
}
|
||||
.shellcommand {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
}
|
||||
.result {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
padding: 10px;
|
||||
}
|
||||
h1 {color: blue;}
|
||||
p {color: red;}
|
||||
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
.row {
|
||||
display: flex;
|
||||
}
|
||||
|
||||
/* Create two equal columns that sits next to each other */
|
||||
.column1 {
|
||||
flex: 30%;
|
||||
padding: 10px;
|
||||
}
|
||||
.column2 {
|
||||
flex: 70%;
|
||||
padding: 10px;
|
||||
}
|
||||
</style>
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div id="waldump"></div>
|
||||
|
||||
<!-- Attach React components -->
|
||||
<script type="text/javascript" src="{{ url_for('static', filename='waldump_bundle.js') }}"></script>
|
||||
</body>
|
||||
@@ -1,25 +0,0 @@
|
||||
#
|
||||
# This file contains work-in-progress code to visualize WAL contents.
|
||||
#
|
||||
# This is the API endpoint that calls a 'zenith_wal_to_json' executable,
|
||||
# which is a hacked version of pg_waldump that prints information about the
|
||||
# records in JSON format. The code in js/waldump.js displays it.
|
||||
#
|
||||
|
||||
import os
|
||||
import re
|
||||
from subprocess import PIPE, STDOUT, run, Popen
|
||||
|
||||
def fetch_wal(request, s3bucket):
|
||||
filename = request.args.get('filename')
|
||||
if not re.match("^[A-Za-z0-9_]+$", filename):
|
||||
raise Exception('invalid WAL filename: ' + filename)
|
||||
|
||||
# FIXME: this downloads the WAL file to current dir. Use a temp dir? Pipe?
|
||||
s3bucket.download_file('walarchive/' + filename, filename)
|
||||
|
||||
result = run("zenith_wal_to_json " + filename, stdout=PIPE, universal_newlines=True, shell=True)
|
||||
|
||||
os.unlink(filename);
|
||||
|
||||
return result.stdout
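# Illustration only, inferred from the fields js/waldump.js reads (record.start,
# record.end, record.xid) rather than from the actual zenith_wal_to_json output:
# the frontend expects JSON shaped roughly like
#   {"records": [{"start": "0/3000028", "end": "0/3000060", "xid": 734}, ...]}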
|
||||
@@ -1,27 +0,0 @@
|
||||
var webpack = require('webpack');
|
||||
module.exports = {
|
||||
entry: {
|
||||
app: './js/app.js',
|
||||
waldump: './js/waldump.js'
|
||||
},
|
||||
output: {
|
||||
filename: "[name]_bundle.js",
|
||||
path: __dirname + '/static'
|
||||
},
|
||||
module: {
|
||||
rules: [
|
||||
{
|
||||
test: /\.js?$/,
|
||||
exclude: /node_modules/,
|
||||
use: {
|
||||
loader: 'babel-loader',
|
||||
options: {
|
||||
presets: ['@babel/preset-env']
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
plugins: [
|
||||
]
|
||||
};
|
||||
@@ -1,179 +0,0 @@
|
||||
#zenith.py
|
||||
import click
|
||||
import testgres
|
||||
import os
|
||||
|
||||
from testgres import PostgresNode
|
||||
from tabulate import tabulate
|
||||
|
||||
zenith_base_dir = '/home/anastasia/zenith/basedir'
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Run the Zenith CLI."""
|
||||
|
||||
@click.group()
|
||||
def pg():
|
||||
"""Db operations
|
||||
|
||||
NOTE: 'database' here means one postgresql node
|
||||
"""
|
||||
|
||||
@click.command(name='create')
|
||||
@click.option('--name', required=True)
|
||||
@click.option('-s', '--storage-name', help='Name of the storage',
|
||||
default='zenith-local',
|
||||
show_default=True)
|
||||
@click.option('--snapshot', help='init from the snapshot. Snap is a name or URL')
|
||||
@click.option('--no-start', is_flag=True, help='Do not start created node',
|
||||
default=False, show_default=True)
|
||||
def pg_create(name, storage_name, snapshot, no_start):
|
||||
"""Initialize the database"""
|
||||
node = PostgresNode()
|
||||
base_dir = os.path.join(zenith_base_dir, 'pg', name)
|
||||
node = testgres.get_new_node(name, base_dir=base_dir)
|
||||
# TODO skip init, instead of that link node with storage or upload it from snapshot
|
||||
node.init()
|
||||
if not no_start:
|
||||
node.start()
|
||||
|
||||
@click.command(name='start')
|
||||
@click.option('--name', required=True)
|
||||
@click.option('--snapshot')
|
||||
@click.option('--read-only', is_flag=True, help='Start read-only node', show_default=True)
|
||||
def pg_start(name, snapshot, read_only):
|
||||
"""Start the database"""
|
||||
node = PostgresNode()
|
||||
base_dir = os.path.join(zenith_base_dir, 'pg', name)
|
||||
node = testgres.get_new_node(name, base_dir=base_dir)
|
||||
# TODO pass snapshot as a parameter
|
||||
node.start()
|
||||
|
||||
@click.command(name='stop')
|
||||
@click.option('--name', required=True)
|
||||
def pg_stop(name):
|
||||
"""Stop the database"""
|
||||
node = PostgresNode()
|
||||
base_dir = os.path.join(zenith_base_dir, 'pg', name)
|
||||
node = testgres.get_new_node(name, base_dir=base_dir)
|
||||
node.stop()
|
||||
|
||||
@click.command(name='destroy')
|
||||
@click.option('--name', required=True)
|
||||
def pg_destroy(name):
|
||||
"""Drop the database"""
|
||||
node = PostgresNode()
|
||||
base_dir = os.path.join(zenith_base_dir, 'pg', name)
|
||||
node = testgres.get_new_node(name, base_dir=base_dir)
|
||||
node.cleanup()
|
||||
|
||||
@click.command(name='list')
|
||||
def pg_list():
|
||||
"""List existing databases"""
|
||||
dirs = os.listdir(os.path.join(zenith_base_dir, 'pg'))
|
||||
path={}
|
||||
status={}
|
||||
data=[]
|
||||
|
||||
for dirname in dirs:
|
||||
path[dirname] = os.path.join(zenith_base_dir, 'pg', dirname)
|
||||
fname = os.path.join( path[dirname], 'data/postmaster.pid')
|
||||
try:
|
||||
f = open(fname,'r')
|
||||
status[dirname] = f.readlines()[-1]
|
||||
except OSError as err:
|
||||
status[dirname]='inactive'
|
||||
data.append([dirname , status[dirname], path[dirname]])
|
||||
|
||||
print(tabulate(data, headers=['Name', 'Status', 'Path']))
|
||||
|
||||
pg.add_command(pg_create)
|
||||
pg.add_command(pg_destroy)
|
||||
pg.add_command(pg_start)
|
||||
pg.add_command(pg_stop)
|
||||
pg.add_command(pg_list)
|
||||
|
||||
|
||||
|
||||
@click.group()
|
||||
def storage():
|
||||
"""Storage operations"""
|
||||
|
||||
@click.command(name='attach')
|
||||
@click.option('--name')
|
||||
def storage_attach(name):
|
||||
"""Attach the storage"""
|
||||
|
||||
@click.command(name='detach')
|
||||
@click.option('--name')
|
||||
@click.option('--force', is_flag=True, show_default=True)
|
||||
def storage_detach(name, force):
|
||||
"""Detach the storage"""
|
||||
|
||||
@click.command(name='list')
|
||||
def storage_list():
|
||||
"""List existing storages"""
|
||||
|
||||
storage.add_command(storage_attach)
|
||||
storage.add_command(storage_detach)
|
||||
storage.add_command(storage_list)
|
||||
|
||||
@click.group()
|
||||
def snapshot():
|
||||
"""Snapshot operations"""
|
||||
|
||||
@click.command(name='create')
|
||||
def snapshot_create():
|
||||
"""Create new snapshot"""
|
||||
|
||||
@click.command(name='destroy')
|
||||
def snapshot_destroy():
|
||||
"""Destroy the snapshot"""
|
||||
|
||||
@click.command(name='pull')
|
||||
def snapshot_pull():
|
||||
"""Pull remote snapshot"""
|
||||
|
||||
@click.command(name='push')
|
||||
def snapshot_push():
|
||||
"""Push snapshot to remote"""
|
||||
|
||||
@click.command(name='import')
|
||||
def snapshot_import():
|
||||
"""Convert given format to zenith snapshot"""
|
||||
|
||||
@click.command(name='export')
|
||||
def snapshot_export():
|
||||
"""Convert zenith snapshot to PostgreSQL compatible format"""
|
||||
|
||||
snapshot.add_command(snapshot_create)
|
||||
snapshot.add_command(snapshot_destroy)
|
||||
snapshot.add_command(snapshot_pull)
|
||||
snapshot.add_command(snapshot_push)
|
||||
snapshot.add_command(snapshot_import)
|
||||
snapshot.add_command(snapshot_export)
|
||||
|
||||
@click.group()
|
||||
def wal():
|
||||
"""WAL operations"""
|
||||
|
||||
@click.command()
|
||||
def wallist(name="list"):
|
||||
"""List WAL files"""
|
||||
|
||||
wal.add_command(wallist)
|
||||
|
||||
|
||||
@click.command()
|
||||
def console():
|
||||
"""Open web console"""
|
||||
|
||||
main.add_command(pg)
|
||||
main.add_command(storage)
|
||||
main.add_command(snapshot)
|
||||
main.add_command(wal)
|
||||
main.add_command(console)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
2373 pageserver/Cargo.lock (generated): diff suppressed because it is too large
@@ -8,12 +8,10 @@ edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
chrono = "0.4.19"
|
||||
crossbeam-channel = "0.5.0"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = "1.0.1"
|
||||
bytes = { version = "1.0.1", features = ['serde'] }
|
||||
byteorder = "1.4.3"
|
||||
fs2 = "0.4.3"
|
||||
futures = "0.3.13"
|
||||
lazy_static = "1.4.0"
|
||||
slog-stdlog = "4.1.0"
|
||||
@@ -26,11 +24,26 @@ clap = "2.33.0"
|
||||
termion = "1.5.6"
|
||||
tui = "0.14.0"
|
||||
daemonize = "0.4.1"
|
||||
rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", features = ["no-verify-ssl"] }
|
||||
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
|
||||
tokio = { version = "1.3.0", features = ["full"] }
|
||||
tokio-stream = { version = "0.1.4" }
|
||||
tokio-postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
postgres-protocol = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
# by default rust-rocksdb tries to build a lot of compression algos. Use lz4 only for now as it is simplest dependency.
|
||||
rocksdb = { version = "0.16.0", features = ["lz4"], default-features = false }
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
walkdir = "2"
|
||||
thiserror = "1.0"
|
||||
hex = "0.4.3"
|
||||
tar = "0.4.33"
|
||||
humantime = "2.1.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
fs_extra = "1.2.0"
|
||||
toml = "0.5"
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
|
||||
@@ -1,82 +1,4 @@
Page Server
===========

How to test
-----------

1. Compile and install Postgres from this repository (there are
   modifications, so vanilla Postgres won't do)

       ./configure --prefix=/home/heikki/zenith-install

2. Compile the page server

       cd pageserver
       cargo build

3. Create another "dummy" cluster that will be used by the page server when it applies
   the WAL records. (We shouldn't really need this; getting rid of it is a TODO):

       /home/heikki/zenith-install/bin/initdb -D /data/zenith-dummy

4. Initialize and start a new postgres cluster

       /home/heikki/zenith-install/bin/initdb -D /data/zenith-test-db --username=postgres
       /home/heikki/zenith-install/bin/postgres -D /data/zenith-test-db

5. In another terminal, start the page server.

       PGDATA=/data/zenith-dummy PATH=/home/heikki/zenith-install/bin:$PATH ./target/debug/pageserver

   It should connect to the postgres instance using streaming replication, and print something
   like this:

       $ PGDATA=/data/zenith-dummy PATH=/home/heikki/zenith-install/bin:$PATH ./target/debug/pageserver
       Starting WAL receiver
       connecting...
       Starting page server on 127.0.0.1:5430
       connected!
       page cache is empty

6. You can now open another terminal and issue DDL commands. The generated WAL records will
   be streamed to the page server and attached to the blocks they apply to in its
   page cache:

       $ psql postgres -U postgres
       psql (14devel)
       Type "help" for help.

       postgres=# create table mydata (i int4);
       CREATE TABLE
       postgres=# insert into mydata select g from generate_series(1,100) g;
       INSERT 0 100
       postgres=#

7. The GetPage@LSN interface to the compute nodes isn't working yet, but to simulate
   that, the page server generates a test GetPage@LSN call every 5 seconds on a random
   block that's in the page cache. In a few seconds, you should see output from that:

       testing GetPage@LSN for block 0
       WAL record at LSN 23584576 initializes the page
       2021-03-19 11:03:13.791 EET [11439] LOG: applied WAL record at 0/167DF40
       2021-03-19 11:03:13.791 EET [11439] LOG: applied WAL record at 0/167DF80
       2021-03-19 11:03:13.791 EET [11439] LOG: applied WAL record at 0/167DFC0
       2021-03-19 11:03:13.791 EET [11439] LOG: applied WAL record at 0/167E018
       2021-03-19 11:03:13.791 EET [11439] LOG: applied WAL record at 0/167E058
       2021-03-19 11:03:13.791 EET [11439] LOG: applied WAL record at 0/167E098
       2021-03-19 11:03:13.791 EET [11439] LOG: applied WAL record at 0/167E0D8
       2021-03-19 11:03:13.792 EET [11439] LOG: applied WAL record at 0/167E118
       2021-03-19 11:03:13.792 EET [11439] LOG: applied WAL record at 0/167E158
       2021-03-19 11:03:13.792 EET [11439] LOG: applied WAL record at 0/167E198
       applied 10 WAL records to produce page image at LSN 18446744073709547246

Architecture
============
## Page server architecture

The Page Server is responsible for all operations on a number of
"chunks" of relation data. A chunk corresponds to a PostgreSQL
@@ -84,8 +6,10 @@ relation segment (i.e. one max. 1 GB file in the data directory), but
it holds all the different versions of every page in the segment that
are still needed by the system.

Determining which chunk each Page Server holds is handled elsewhere. (TODO:
currently, there is only one Page Server which holds all chunks)
Currently we do not specifically organize data in chunks.
All page images and corresponding WAL records are stored as entries in a key-value store,
where StorageKey is a zenith_timeline_id + BufferTag + LSN.

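For illustration only, a minimal Rust sketch of such a composite key follows. The field names and types (and the `BufferTag` layout) are assumptions made for this example, not the definitions used by the page server; the point is that ordering the key as (timeline, buffer tag, LSN) keeps all versions of a page adjacent and sorted by LSN:

    // Hypothetical sketch of the composite storage key described above.
    #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
    struct BufferTag {
        spcnode: u32, // tablespace OID
        dbnode: u32,  // database OID
        relnode: u32, // relation OID
        forknum: u8,  // fork: main, FSM, VM, ...
        blknum: u32,  // block number within the relation
    }

    #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
    struct StorageKey {
        timeline: u128, // zenith_timeline_id
        tag: BufferTag,
        lsn: u64,
    }

    fn main() {
        let tag = BufferTag { spcnode: 1663, dbnode: 13008, relnode: 16384, forknum: 0, blknum: 0 };
        let v1 = StorageKey { timeline: 1, tag, lsn: 0x0167_DF40 };
        let v2 = StorageKey { lsn: 0x0167_DF80, ..v1 };
        // Two versions of the same page differ only in LSN and sort next to each other.
        assert!(v1 < v2);
    }
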
The Page Server has a few different duties:

@@ -154,11 +78,33 @@ and stores them to the page cache.
Page Cache
----------

The Page Cache is a data structure that holds all the different page versions.
It is accessed by all the other threads to perform their duties.
The Page Cache is a switchboard to access the different Repositories.

Currently, the page cache is implemented fully in-memory. TODO: Store it
on disk. Define a file format.
#### Repository
A Repository corresponds to one .zenith directory.
A Repository is needed to manage Timelines.

#### Timeline
A Timeline is the page cache workhorse: it accepts page changes
and serves get_page_at_lsn() and get_rel_size() requests.
Note: this has nothing to do with a PostgreSQL WAL timeline.

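As a rough illustration of how a caller uses that interface (the signatures below are simplified assumptions for this sketch, not copies of the real trait in pageserver/src/repository.rs):

    // Simplified stand-ins for the real types; error handling is omitted.
    type Lsn = u64;

    #[derive(Clone, Copy)]
    struct RelTag {
        relnode: u32,
        forknum: u8,
    }

    trait Timeline {
        /// Materialize the image of block `blknum` of `rel` as of `lsn`.
        fn get_page_at_lsn(&self, rel: RelTag, blknum: u32, lsn: Lsn) -> Vec<u8>;
        /// Relation size in blocks as of `lsn`.
        fn get_rel_size(&self, rel: RelTag, lsn: Lsn) -> u32;
    }

    /// A compute-node read path boils down to something like this.
    fn read_block(tl: &dyn Timeline, rel: RelTag, blknum: u32, lsn: Lsn) -> Vec<u8> {
        assert!(blknum < tl.get_rel_size(rel, lsn));
        tl.get_page_at_lsn(rel, blknum, lsn)
    }

    fn main() {}
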
#### Branch
We can create a branch at a certain LSN.
Each Branch lives in a corresponding timeline and has an ancestor.

To get a full snapshot of the data at a certain moment, we need to traverse the timeline and its ancestors, as sketched below.

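A minimal sketch of that traversal rule, with names and structure assumed purely for illustration: read a page version from the branch's own timeline if it has one at or before the requested LSN, otherwise continue from the branch point on the ancestor.

    type Lsn = u64;

    /// Toy model of a timeline with an optional ancestor, used only to
    /// illustrate the traversal; the real Timeline/Branch types differ.
    struct TimelineNode {
        /// page versions stored on this timeline: (blknum, lsn)
        versions: Vec<(u32, Lsn)>,
        /// ancestor timeline and the LSN at which this branch was created
        ancestor: Option<(Box<TimelineNode>, Lsn)>,
    }

    /// Find the latest version of `blknum` at or before `lsn`,
    /// walking up the ancestor chain if this timeline has none.
    fn latest_version(tl: &TimelineNode, blknum: u32, lsn: Lsn) -> Option<Lsn> {
        let local = tl
            .versions
            .iter()
            .filter(|(b, l)| *b == blknum && *l <= lsn)
            .map(|(_, l)| *l)
            .max();
        if local.is_some() {
            return local;
        }
        // Nothing on this timeline: continue from the branch point on the ancestor.
        match &tl.ancestor {
            Some((parent, branch_lsn)) => latest_version(parent, blknum, lsn.min(*branch_lsn)),
            None => None,
        }
    }

    fn main() {
        let main_tl = TimelineNode { versions: vec![(0, 100), (0, 200)], ancestor: None };
        let branch = TimelineNode { versions: vec![(0, 400)], ancestor: Some((Box::new(main_tl), 250)) };
        // The branch was created at LSN 250, so a read at LSN 300 falls back to the ancestor.
        assert_eq!(latest_version(&branch, 0, 300), Some(200));
        assert_eq!(latest_version(&branch, 0, 500), Some(400));
    }
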
#### ObjectRepository
ObjectRepository implements Repository and has an associated ObjectStore and WAL redo service.

#### ObjectStore
ObjectStore is an interface to a key-value store for page images and WAL records.
Currently it has one implementation: RocksDB.

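To make the shape of that interface concrete, here is a hedged sketch with an in-memory map standing in for RocksDB; the method names are assumptions for the example, and the real trait is richer (range scans, error handling):

    use std::collections::BTreeMap;

    /// Minimal sketch of a key-value object store.
    trait ObjectStore {
        fn put(&mut self, key: Vec<u8>, value: Vec<u8>);
        fn get(&self, key: &[u8]) -> Option<&Vec<u8>>;
    }

    /// In-memory stand-in: a sorted map gives the same "ordered keys" property
    /// that the LSN-suffixed storage keys rely on.
    struct MemStore {
        map: BTreeMap<Vec<u8>, Vec<u8>>,
    }

    impl ObjectStore for MemStore {
        fn put(&mut self, key: Vec<u8>, value: Vec<u8>) {
            self.map.insert(key, value);
        }
        fn get(&self, key: &[u8]) -> Option<&Vec<u8>> {
            self.map.get(key)
        }
    }

    fn main() {
        let mut store = MemStore { map: BTreeMap::new() };
        store.put(b"page@lsn1".to_vec(), vec![0u8; 8192]);
        assert!(store.get(b"page@lsn1").is_some());
    }
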
#### WAL redo service
The WAL redo service runs PostgreSQL in a special wal_redo mode
to apply the given WAL records on top of an old page image and return the new page image.
A signature sketch follows.

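Conceptually the request boils down to "old page image + WAL records in LSN order -> new page image". The trait and type names below are assumptions for this sketch; the real manager hands the work to a dedicated PostgreSQL child process rather than applying records in Rust:

    type Lsn = u64;

    /// One WAL record to replay, in LSN order.
    struct WalRecord {
        lsn: Lsn,
        rec: Vec<u8>,
    }

    trait WalRedoManager {
        /// Apply `records` on top of `old_image` (None if the first record
        /// initializes the page) and return the reconstructed page image.
        fn request_redo(&self, old_image: Option<Vec<u8>>, records: &[WalRecord]) -> Vec<u8>;
    }

    fn main() {}
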
TODO: Garbage Collection / Compaction

@@ -177,3 +123,7 @@ The backup service is responsible for periodically pushing the chunks to S3.
TODO: How/when do we restore from S3? Whenever we get a GetPage@LSN request for
a chunk we don't currently have? Or when an external Control Plane tells us?

TODO: Sharding
--------------------

We should be able to run multiple Page Servers that handle sharded data.

@@ -1,41 +0,0 @@
|
||||
//
// Triggers a postgres build if there is no postgres binary present at
// 'REPO_ROOT/tmp_install/bin/postgres'.
//
// I can see a lot of disadvantages with such automation; the main
// advantage here is the ability to build everything and run integration tests
// in a bare repo by running 'cargo test'.
//
// We could detect whether it is a debug or release build and run the
// corresponding pg build, but that seems like overkill for now.
//
// Problem #1 -- the language server in my editor likes calling 'cargo build'
// by itself. So if I delete the tmp_install directory it would magically reappear
// after some time. During this compilation 'cargo build' may whine about
// "waiting for file lock on build directory".
//
// Problem #2 -- cargo would run this only if something has changed in
// the crate.
//
// And generally speaking, postgres is not a build dependency of the pageserver,
// just of the integration tests. So let's not mix that. I'll leave this file in
// place for some time, just in case anybody starts doing the same.
//
|
||||
// use std::path::Path;
|
||||
// use std::process::{Command};
|
||||
|
||||
fn main() {
|
||||
// // build some postgres if it is not built yet
|
||||
// if !Path::new("../tmp_install/bin/postgres").exists() {
|
||||
// let make_res = Command::new("make")
|
||||
// .arg("postgres")
|
||||
// .env_clear()
|
||||
// .status()
|
||||
// .expect("failed to execute 'make postgres'");
|
||||
|
||||
// if !make_res.success() {
|
||||
// panic!("postgres build failed");
|
||||
// }
|
||||
// }
|
||||
}
|
||||
@@ -1,62 +0,0 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Set up a simple Compute Node + Page Server combination locally.
|
||||
#
|
||||
# NOTE: This doesn't clean up between invocations. You'll need to manually:
|
||||
#
|
||||
# - Kill any previous 'postgres' and 'pageserver' processes
|
||||
# - Clear the S3 bucket
|
||||
# - Remove the 'zenith-pgdata' directory
|
||||
|
||||
|
||||
set -e
|
||||
|
||||
# Set up some config.
|
||||
#
|
||||
# CHANGE THESE ACCORDING TO YOUR S3 INSTALLATION
|
||||
export S3_REGION=auto
|
||||
export S3_ENDPOINT=https://localhost:9000
|
||||
export S3_ACCESSKEY=minioadmin
|
||||
export S3_SECRET=pikkunen
|
||||
export S3_BUCKET=zenith-testbucket
|
||||
|
||||
|
||||
COMPUTE_NODE_PGDATA=zenith-pgdata
|
||||
|
||||
|
||||
# 1. Initialize a cluster.
|
||||
initdb -D $COMPUTE_NODE_PGDATA -U zenith
|
||||
|
||||
echo "port=65432" >> $COMPUTE_NODE_PGDATA/postgresql.conf
|
||||
echo "log_connections=on" >> $COMPUTE_NODE_PGDATA/postgresql.conf
|
||||
|
||||
# Use a small shared_buffers, so that we hit the Page Server more
|
||||
# easily.
|
||||
echo "shared_buffers = 1MB" >> $COMPUTE_NODE_PGDATA/postgresql.conf
|
||||
|
||||
# TODO: page server should use a replication slot, or some other mechanism
|
||||
# to make sure that the primary doesn't lose data that the page server still
|
||||
# needs. (The WAL safekeepers should ensure that)
|
||||
echo "wal_keep_size=10GB" >> $COMPUTE_NODE_PGDATA/postgresql.conf
|
||||
|
||||
# Tell the Postgres server how to connect to the Page Server
|
||||
echo "page_server_connstring='host=localhost port=5430'" >> $COMPUTE_NODE_PGDATA/postgresql.conf
|
||||
|
||||
|
||||
# 2. Run zenith_push to push a base backup of the database to an S3 bucket. The
|
||||
# Page Server will read it from there
|
||||
zenith_push -D $COMPUTE_NODE_PGDATA
|
||||
|
||||
|
||||
# 3. Launch page server
|
||||
rm -rf /tmp/pgdata-dummy
|
||||
initdb -N -D /tmp/pgdata-dummy
|
||||
PGDATA=/tmp/pgdata-dummy ./target/debug/pageserver &
|
||||
|
||||
# 4. Start up the Postgres server
|
||||
postgres -D $COMPUTE_NODE_PGDATA &
|
||||
|
||||
|
||||
echo "ALL SET! You can now connect to Postgres with something like:"
|
||||
echo ""
|
||||
echo 'psql "dbname=postgres host=localhost user=zenith port=65432"'
|
||||
346 pageserver/src/basebackup.rs (new file)
@@ -0,0 +1,346 @@
|
||||
//!
|
||||
//! Generate a tarball with files needed to bootstrap ComputeNode.
|
||||
//!
|
||||
//! TODO: this module has nothing to do with PostgreSQL pg_basebackup.
|
||||
//! It could use a better name.
|
||||
//!
|
||||
//! A stateless Postgres compute node is launched by sending it a tarball that contains non-relational data (multixacts, clog, filenodemaps, twophase files)
//! plus a generated pg_control and a dummy WAL segment. This module is responsible for creating such a tarball from the snapshot directory and
//! the data stored in the object storage.
|
||||
//!
|
||||
use crate::ZTimelineId;
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use log::*;
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
use tar::{Builder, Header};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::object_key::*;
|
||||
use crate::repository::Timeline;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
/// This is short-living object only for the time of tarball creation,
|
||||
/// created mostly to avoid passing a lot of parameters between various functions
|
||||
/// used for constructing tarball.
|
||||
pub struct Basebackup<'a> {
|
||||
ar: Builder<&'a mut dyn Write>,
|
||||
timeline: &'a Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
snappath: String,
|
||||
slru_buf: [u8; pg_constants::SLRU_SEG_SIZE],
|
||||
slru_segno: u32,
|
||||
slru_path: &'static str,
|
||||
}
|
||||
|
||||
impl<'a> Basebackup<'a> {
|
||||
pub fn new(
|
||||
write: &'a mut dyn Write,
|
||||
timelineid: ZTimelineId,
|
||||
timeline: &'a Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
snapshot_lsn: Lsn,
|
||||
) -> Basebackup<'a> {
|
||||
Basebackup {
|
||||
ar: Builder::new(write),
|
||||
timeline,
|
||||
lsn,
|
||||
prev_record_lsn,
|
||||
snappath: format!("timelines/{}/snapshots/{:016X}", timelineid, snapshot_lsn.0),
|
||||
slru_path: "",
|
||||
slru_segno: u32::MAX,
|
||||
slru_buf: [0u8; pg_constants::SLRU_SEG_SIZE],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn send_tarball(&mut self) -> anyhow::Result<()> {
|
||||
debug!("sending tarball of snapshot in {}", self.snappath);
|
||||
for entry in WalkDir::new(&self.snappath) {
|
||||
let entry = entry?;
|
||||
let fullpath = entry.path();
|
||||
let relpath = entry.path().strip_prefix(&self.snappath).unwrap();
|
||||
|
||||
if relpath.to_str().unwrap() == "" {
|
||||
continue;
|
||||
}
|
||||
|
||||
if entry.file_type().is_dir() {
|
||||
trace!(
|
||||
"sending dir {} as {}",
|
||||
fullpath.display(),
|
||||
relpath.display()
|
||||
);
|
||||
self.ar.append_dir(relpath, fullpath)?;
|
||||
} else if entry.file_type().is_symlink() {
|
||||
error!("ignoring symlink in snapshot dir");
|
||||
} else if entry.file_type().is_file() {
|
||||
if !is_rel_file_path(relpath.to_str().unwrap()) {
|
||||
if entry.file_name() != "pg_filenode.map" // this files will be generated from object storage
|
||||
&& !relpath.starts_with("pg_xact/")
|
||||
&& !relpath.starts_with("pg_multixact/")
|
||||
{
|
||||
trace!("sending {}", relpath.display());
|
||||
self.ar.append_path_with_name(fullpath, relpath)?;
|
||||
}
|
||||
} else {
|
||||
// relation pages are loaded on demand and should not be included in tarball
|
||||
trace!("not sending {}", relpath.display());
|
||||
}
|
||||
} else {
|
||||
error!("unknown file type: {}", fullpath.display());
|
||||
}
|
||||
}
|
||||
|
||||
// Generate non-relational files.
|
||||
// Iteration is in sorted order: all objects of the same type are grouped and traversed
// in ascending key order. For example, all pg_xact records precede pg_multixact records and are sorted by block number.
// This makes it easy to construct SLRU segments (32 blocks each).
|
||||
for obj in self.timeline.list_nonrels(self.lsn)? {
|
||||
match obj {
|
||||
ObjectTag::Clog(slru) => self.add_slru_segment("pg_xact", &obj, slru.blknum)?,
|
||||
ObjectTag::MultiXactMembers(slru) => {
|
||||
self.add_slru_segment("pg_multixact/members", &obj, slru.blknum)?
|
||||
}
|
||||
ObjectTag::MultiXactOffsets(slru) => {
|
||||
self.add_slru_segment("pg_multixact/offsets", &obj, slru.blknum)?
|
||||
}
|
||||
ObjectTag::FileNodeMap(db) => self.add_relmap_file(&obj, &db)?,
|
||||
ObjectTag::TwoPhase(prepare) => self.add_twophase_file(&obj, prepare.xid)?,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
self.finish_slru_segment()?; // write last non-completed SLRU segment (if any)
|
||||
self.add_pgcontrol_file()?;
|
||||
self.ar.finish()?;
|
||||
debug!("all tarred up!");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
// Generate SLRU segment files from the repository. `path` identifies the SLRU kind (pg_xact, pg_multixact/members, ...).
// (`slru_path` is initially an empty string.)
//
|
||||
fn add_slru_segment(
|
||||
&mut self,
|
||||
path: &'static str,
|
||||
tag: &ObjectTag,
|
||||
page: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_page_at_lsn_nowait(*tag, self.lsn, false)?;
|
||||
// Zero length image indicates truncated segment: just skip it
|
||||
if !img.is_empty() {
|
||||
assert!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
let segno = page / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if self.slru_path != "" && (self.slru_segno != segno || self.slru_path != path) {
|
||||
// Switch to new segment: save old one
|
||||
let segname = format!("{}/{:>04X}", self.slru_path, self.slru_segno);
|
||||
let header = new_tar_header(&segname, pg_constants::SLRU_SEG_SIZE as u64)?;
|
||||
self.ar.append(&header, &self.slru_buf[..])?;
|
||||
self.slru_buf = [0u8; pg_constants::SLRU_SEG_SIZE]; // reinitialize segment buffer
|
||||
}
|
||||
self.slru_segno = segno;
|
||||
self.slru_path = path;
|
||||
let offs_start = (page % pg_constants::SLRU_PAGES_PER_SEGMENT) as usize
|
||||
* pg_constants::BLCKSZ as usize;
|
||||
let offs_end = offs_start + pg_constants::BLCKSZ as usize;
|
||||
self.slru_buf[offs_start..offs_end].copy_from_slice(&img);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
// We flush SLRU segments to the tarball once they are completed.
// This method is used to flush the last (possibly incomplete) segment.
//
|
||||
fn finish_slru_segment(&mut self) -> anyhow::Result<()> {
|
||||
if self.slru_path != "" {
|
||||
// if there is an incomplete segment
|
||||
let segname = format!("{}/{:>04X}", self.slru_path, self.slru_segno);
|
||||
let header = new_tar_header(&segname, pg_constants::SLRU_SEG_SIZE as u64)?;
|
||||
self.ar.append(&header, &self.slru_buf[..])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Extract pg_filenode.map files from repository
|
||||
//
|
||||
fn add_relmap_file(&mut self, tag: &ObjectTag, db: &DatabaseTag) -> anyhow::Result<()> {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_page_at_lsn_nowait(*tag, self.lsn, false)?;
|
||||
info!("add_relmap_file {:?}", db);
|
||||
let path = if db.spcnode == pg_constants::GLOBALTABLESPACE_OID {
|
||||
String::from("global/pg_filenode.map") // filenode map for global tablespace
|
||||
} else {
|
||||
// User defined tablespaces are not supported
|
||||
assert!(db.spcnode == pg_constants::DEFAULTTABLESPACE_OID);
|
||||
let src_path = format!("{}/base/1/PG_VERSION", self.snappath);
|
||||
let dst_path = format!("base/{}/PG_VERSION", db.dbnode);
|
||||
self.ar.append_path_with_name(&src_path, &dst_path)?;
|
||||
format!("base/{}/pg_filenode.map", db.dbnode)
|
||||
};
|
||||
assert!(img.len() == 512);
|
||||
let header = new_tar_header(&path, img.len() as u64)?;
|
||||
self.ar.append(&header, &img[..])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Extract twophase state files
|
||||
//
|
||||
fn add_twophase_file(&mut self, tag: &ObjectTag, xid: TransactionId) -> anyhow::Result<()> {
|
||||
// Include in the tarball only the two-phase files of in-progress transactions
|
||||
if self.timeline.get_tx_status(xid, self.lsn)?
|
||||
== pg_constants::TRANSACTION_STATUS_IN_PROGRESS
|
||||
{
|
||||
let img = self
|
||||
.timeline
|
||||
.get_page_at_lsn_nowait(*tag, self.lsn, false)?;
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
let crc = crc32c::crc32c(&img[..]);
|
||||
buf.put_u32_le(crc);
|
||||
let path = format!("pg_twophase/{:>08X}", xid);
|
||||
let header = new_tar_header(&path, buf.len() as u64)?;
|
||||
self.ar.append(&header, &buf[..])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Add generated pg_control file
|
||||
//
|
||||
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
|
||||
let checkpoint_bytes =
|
||||
self.timeline
|
||||
.get_page_at_lsn_nowait(ObjectTag::Checkpoint, self.lsn, false)?;
|
||||
let pg_control_bytes =
|
||||
self.timeline
|
||||
.get_page_at_lsn_nowait(ObjectTag::ControlFile, self.lsn, false)?;
|
||||
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
|
||||
// Generate new pg_control and WAL needed for bootstrap
|
||||
let checkpoint_segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let checkpoint_lsn = XLogSegNoOffsetToRecPtr(
|
||||
checkpoint_segno,
|
||||
XLOG_SIZE_OF_XLOG_LONG_PHD as u32,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
);
|
||||
checkpoint.redo = self.lsn.0 + self.lsn.calc_padding(8u32);
|
||||
|
||||
//reset some fields we don't want to preserve
|
||||
checkpoint.oldestActiveXid = 0;
|
||||
|
||||
//save new values in pg_control
|
||||
pg_control.checkPoint = checkpoint_lsn;
|
||||
pg_control.checkPointCopy = checkpoint;
|
||||
info!("pg_control.state = {}", pg_control.state);
|
||||
pg_control.state = pg_constants::DB_SHUTDOWNED;
|
||||
|
||||
// add zenith.signal file
|
||||
self.ar.append(
|
||||
&new_tar_header("zenith.signal", 8)?,
|
||||
&self.prev_record_lsn.0.to_le_bytes()[..],
|
||||
)?;
|
||||
|
||||
//send pg_control
|
||||
let pg_control_bytes = pg_control.encode();
|
||||
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
|
||||
self.ar.append(&header, &pg_control_bytes[..])?;
|
||||
|
||||
//send wal segment
|
||||
let wal_file_name = XLogFileName(
|
||||
1, // FIXME: always use Postgres timeline 1
|
||||
checkpoint_segno,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
);
|
||||
let wal_file_path = format!("pg_wal/{}", wal_file_name);
|
||||
let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
|
||||
let wal_seg = generate_wal_segment(&pg_control);
|
||||
self.ar.append(&header, &wal_seg[..])?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Parse a path, relative to the root of PostgreSQL data directory, as
|
||||
/// a PostgreSQL relation data file.
|
||||
///
|
||||
fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
|
||||
/*
|
||||
* Relation data files can be in one of the following directories:
|
||||
*
|
||||
* global/
|
||||
* shared relations
|
||||
*
|
||||
* base/<db oid>/
|
||||
* regular relations, default tablespace
|
||||
*
|
||||
* pg_tblspc/<tblspc oid>/<tblspc version>/
|
||||
* within a non-default tablespace (the name of the directory
|
||||
* depends on version)
|
||||
*
|
||||
* And the relation data files themselves have a filename like:
|
||||
*
|
||||
* <oid>.<segment number>
|
||||
*/
|
||||
if let Some(fname) = path.strip_prefix("global/") {
|
||||
let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;
|
||||
|
||||
Ok(())
|
||||
} else if let Some(dbpath) = path.strip_prefix("base/") {
|
||||
let mut s = dbpath.split('/');
|
||||
let dbnode_str = s.next().ok_or(FilePathError::InvalidFileName)?;
|
||||
let _dbnode = dbnode_str.parse::<u32>()?;
|
||||
let fname = s.next().ok_or(FilePathError::InvalidFileName)?;
|
||||
if s.next().is_some() {
|
||||
return Err(FilePathError::InvalidFileName);
|
||||
};
|
||||
|
||||
let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;
|
||||
|
||||
Ok(())
|
||||
} else if path.strip_prefix("pg_tblspc/").is_some() {
|
||||
// TODO
|
||||
error!("tablespaces not implemented yet");
|
||||
Err(FilePathError::InvalidFileName)
|
||||
} else {
|
||||
Err(FilePathError::InvalidFileName)
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Check if it is relational file
|
||||
//
|
||||
fn is_rel_file_path(path: &str) -> bool {
|
||||
parse_rel_file_path(path).is_ok()
|
||||
}
|
||||
|
||||
//
|
||||
// Create new tarball entry header
|
||||
//
|
||||
fn new_tar_header(path: &str, size: u64) -> anyhow::Result<Header> {
|
||||
let mut header = Header::new_gnu();
|
||||
header.set_size(size);
|
||||
header.set_path(path)?;
|
||||
header.set_mode(0b110000000); // -rw-------
|
||||
header.set_mtime(
|
||||
// use the current time as the last-modified time
|
||||
SystemTime::now()
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs(),
|
||||
);
|
||||
header.set_cksum();
|
||||
Ok(header)
|
||||
}
|
||||
@@ -1,43 +0,0 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings};
|
||||
|
||||
pub mod pg;
|
||||
pub mod snapshot;
|
||||
pub mod storage;
|
||||
mod subcommand;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let cli_commands = subcommand::ClapCommands {
|
||||
commands: vec![
|
||||
Box::new(pg::PgCmd {
|
||||
clap_cmd: clap::SubCommand::with_name("pg"),
|
||||
}),
|
||||
Box::new(storage::StorageCmd {
|
||||
clap_cmd: clap::SubCommand::with_name("storage"),
|
||||
}),
|
||||
Box::new(snapshot::SnapshotCmd {
|
||||
clap_cmd: clap::SubCommand::with_name("snapshot"),
|
||||
}),
|
||||
],
|
||||
};
|
||||
|
||||
let matches = App::new("zenith")
|
||||
.about("Zenith CLI")
|
||||
.version("1.0")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommands(cli_commands.generate())
|
||||
.get_matches();
|
||||
|
||||
if let Some(subcommand) = matches.subcommand_name() {
|
||||
println!("'git {}' was used", subcommand);
|
||||
}
|
||||
|
||||
match matches.subcommand() {
|
||||
("pg", Some(sub_args)) => cli_commands.commands[0].run(sub_args.clone())?,
|
||||
("storage", Some(sub_args)) => cli_commands.commands[1].run(sub_args.clone())?,
|
||||
("snapshot", Some(sub_args)) => cli_commands.commands[2].run(sub_args.clone())?,
|
||||
("", None) => println!("No subcommand"),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,105 +0,0 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings, Arg};
|
||||
|
||||
use crate::subcommand;
|
||||
|
||||
pub struct PgCmd<'a> {
|
||||
pub clap_cmd: clap::App<'a, 'a>,
|
||||
}
|
||||
|
||||
impl subcommand::SubCommand for PgCmd<'_> {
|
||||
fn gen_clap_command(&self) -> clap::App {
|
||||
let c = self.clap_cmd.clone();
|
||||
c.about("Operations with zenith compute nodes")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommand(App::new("list").about("List existing compute nodes"))
|
||||
.subcommand(
|
||||
App::new("create")
|
||||
.about(
|
||||
"Create (init) new data directory using given storage and start postgres",
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("storage")
|
||||
.short("s")
|
||||
.long("storage")
|
||||
.takes_value(true)
|
||||
.help("Name of the storage node to use"),
|
||||
)
|
||||
//TODO should it be just name of uploaded snapshot or some path?
|
||||
.arg(
|
||||
Arg::with_name("snapshot")
|
||||
.long("snapshot")
|
||||
.takes_value(true)
|
||||
.help("Name of the snapshot to use"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("nostart")
|
||||
.long("no-start")
|
||||
.takes_value(false)
|
||||
.help("Don't start postgres on the created node"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("destroy")
|
||||
.about("Stop postgres and destroy node's data directory")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("start")
|
||||
.about("Start postgres on the given node")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("replica")
|
||||
.long("replica")
|
||||
.takes_value(false)
|
||||
.help("Start the compute node as replica"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("stop")
|
||||
.about("Stop postgres on the given node")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("show")
|
||||
.about("Show info about the given node")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()> {
|
||||
println!("Run PgCmd with args {:?}", args);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,27 +0,0 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings, Arg};
|
||||
|
||||
use crate::subcommand;
|
||||
|
||||
pub struct SnapshotCmd<'a> {
|
||||
pub clap_cmd: clap::App<'a, 'a>,
|
||||
}
|
||||
|
||||
impl subcommand::SubCommand for SnapshotCmd<'_> {
|
||||
fn gen_clap_command(&self) -> clap::App {
|
||||
let c = self.clap_cmd.clone();
|
||||
c.about("Operations with zenith snapshots")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommand(App::new("list"))
|
||||
.subcommand(App::new("create").arg(Arg::with_name("pgdata").required(true)))
|
||||
.subcommand(App::new("destroy"))
|
||||
.subcommand(App::new("start"))
|
||||
.subcommand(App::new("stop"))
|
||||
.subcommand(App::new("show"))
|
||||
}
|
||||
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()> {
|
||||
println!("Run SnapshotCmd with args {:?}", args);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,25 +0,0 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings};
|
||||
|
||||
use crate::subcommand;
|
||||
|
||||
pub struct StorageCmd<'a> {
|
||||
pub clap_cmd: clap::App<'a, 'a>,
|
||||
}
|
||||
|
||||
impl subcommand::SubCommand for StorageCmd<'_> {
|
||||
fn gen_clap_command(&self) -> clap::App {
|
||||
let c = self.clap_cmd.clone();
|
||||
c.about("Operations with zenith storage nodes")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommand(App::new("list"))
|
||||
.subcommand(App::new("attach"))
|
||||
.subcommand(App::new("detach"))
|
||||
.subcommand(App::new("show"))
|
||||
}
|
||||
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()> {
|
||||
println!("Run StorageCmd with args {:?}", args);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,29 +0,0 @@
|
||||
use anyhow::Result;
|
||||
|
||||
/// All subcommands need to implement this interface.
|
||||
pub trait SubCommand {
|
||||
/// Generates the cli-config that Clap requires for the subcommand.
|
||||
fn gen_clap_command(&self) -> clap::App;
|
||||
|
||||
/// Runs the body of the subcommand.
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()>;
|
||||
}
|
||||
|
||||
/// A struct which holds a vector of heap-allocated `Box`es of trait objects all of which must
|
||||
/// implement the `SubCommand` trait, but other than that, can be of any type.
|
||||
pub struct ClapCommands {
|
||||
pub commands: Vec<Box<dyn SubCommand>>,
|
||||
}
|
||||
|
||||
impl ClapCommands {
|
||||
/// Generates a vector of `clap::Apps` that can be passed into clap's `.subcommands()` method in
|
||||
/// order to generate the full CLI.
|
||||
pub fn generate(&self) -> Vec<clap::App> {
|
||||
let mut v: Vec<clap::App> = Vec::new();
|
||||
|
||||
for command in self.commands.iter() {
|
||||
v.push(command.gen_clap_command());
|
||||
}
|
||||
v
|
||||
}
|
||||
}
|
||||
@@ -3,146 +3,286 @@
|
||||
//
|
||||
|
||||
use log::*;
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::PathBuf;
|
||||
use std::thread;
|
||||
use std::{fs::File, fs::OpenOptions, str::FromStr};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
env,
|
||||
fs::{File, OpenOptions},
|
||||
io,
|
||||
net::TcpListener,
|
||||
path::{Path, PathBuf},
|
||||
process::exit,
|
||||
thread,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use clap::{App, Arg};
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use slog;
|
||||
use slog::Drain;
|
||||
use slog_scope;
|
||||
use slog_stdlog;
|
||||
use slog::{Drain, FnValue};
|
||||
|
||||
use pageserver::page_service;
|
||||
use pageserver::restore_s3;
|
||||
use pageserver::tui;
|
||||
use pageserver::walreceiver;
|
||||
use pageserver::PageServerConf;
|
||||
use pageserver::{branches, page_cache, page_service, tui, PageServerConf};
|
||||
|
||||
fn main() -> Result<(), io::Error> {
|
||||
const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:64000";
|
||||
|
||||
const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
|
||||
|
||||
const DEFAULT_WAL_REDOERS: usize = 1;
|
||||
|
||||
/// String arguments that can be declared via CLI or config file
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct CfgFileParams {
|
||||
listen_addr: Option<String>,
|
||||
gc_horizon: Option<String>,
|
||||
gc_period: Option<String>,
|
||||
wal_redoers: Option<String>,
|
||||
pg_distrib_dir: Option<String>,
|
||||
}
|
||||
|
||||
impl CfgFileParams {
|
||||
/// Extract string arguments from CLI
|
||||
fn from_args(arg_matches: &ArgMatches) -> Self {
|
||||
let get_arg = |arg_name: &str| -> Option<String> {
|
||||
arg_matches.value_of(arg_name).map(str::to_owned)
|
||||
};
|
||||
|
||||
Self {
|
||||
listen_addr: get_arg("listen"),
|
||||
gc_horizon: get_arg("gc_horizon"),
|
||||
gc_period: get_arg("gc_period"),
|
||||
wal_redoers: get_arg("wal_redoers"),
|
||||
pg_distrib_dir: get_arg("postgres-distrib"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Fill missing values in `self` with `other`
|
||||
fn or(self, other: CfgFileParams) -> Self {
|
||||
// TODO cleaner way to do this
|
||||
Self {
|
||||
listen_addr: self.listen_addr.or(other.listen_addr),
|
||||
gc_horizon: self.gc_horizon.or(other.gc_horizon),
|
||||
gc_period: self.gc_period.or(other.gc_period),
|
||||
wal_redoers: self.wal_redoers.or(other.wal_redoers),
|
||||
pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a PageServerConf from these string parameters
|
||||
fn try_into_config(&self) -> Result<PageServerConf> {
|
||||
let listen_addr = match self.listen_addr.as_ref() {
|
||||
Some(addr) => addr.clone(),
|
||||
None => DEFAULT_LISTEN_ADDR.to_owned(),
|
||||
};
|
||||
|
||||
let gc_horizon: u64 = match self.gc_horizon.as_ref() {
|
||||
Some(horizon_str) => horizon_str.parse()?,
|
||||
None => DEFAULT_GC_HORIZON,
|
||||
};
|
||||
let gc_period = match self.gc_period.as_ref() {
|
||||
Some(period_str) => humantime::parse_duration(period_str)?,
|
||||
None => DEFAULT_GC_PERIOD,
|
||||
};
|
||||
|
||||
let wal_redoers = match self.wal_redoers.as_ref() {
|
||||
Some(wal_redoers_str) => wal_redoers_str.parse::<usize>()?,
|
||||
None => DEFAULT_WAL_REDOERS,
|
||||
};
|
||||
|
||||
let pg_distrib_dir = match self.pg_distrib_dir.as_ref() {
|
||||
Some(pg_distrib_dir_str) => PathBuf::from(pg_distrib_dir_str),
|
||||
None => env::current_dir()?.join("tmp_install"),
|
||||
};
|
||||
|
||||
if !pg_distrib_dir.join("bin/postgres").exists() {
|
||||
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
|
||||
}
|
||||
|
||||
Ok(PageServerConf {
|
||||
daemonize: false,
|
||||
interactive: false,
|
||||
materialize: false,
|
||||
|
||||
listen_addr,
|
||||
gc_horizon,
|
||||
gc_period,
|
||||
wal_redoers,
|
||||
workdir: PathBuf::from("."),
|
||||
|
||||
pg_distrib_dir,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.arg(Arg::with_name("datadir")
|
||||
.short("D")
|
||||
.long("dir")
|
||||
.takes_value(true)
|
||||
.help("Path to the page server data directory"))
|
||||
.arg(Arg::with_name("wal_producer")
|
||||
.short("w")
|
||||
.long("wal-producer")
|
||||
.takes_value(true)
|
||||
.help("connect to the WAL sender (postgres or wal_acceptor) on connstr (default: 'host=127.0.0.1 port=65432 user=zenith')"))
|
||||
.arg(Arg::with_name("listen")
|
||||
.short("l")
|
||||
.long("listen")
|
||||
.takes_value(true)
|
||||
.help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"))
|
||||
.arg(Arg::with_name("interactive")
|
||||
.short("i")
|
||||
.long("interactive")
|
||||
.takes_value(false)
|
||||
.help("Interactive mode"))
|
||||
.arg(Arg::with_name("daemonize")
|
||||
.short("d")
|
||||
.long("daemonize")
|
||||
.takes_value(false)
|
||||
.help("Run in the background"))
|
||||
.arg(Arg::with_name("skip_recovery")
|
||||
.long("skip-recovery")
|
||||
.takes_value(false)
|
||||
.help("Skip S3 recovery procedy and start empty"))
|
||||
.arg(
|
||||
Arg::with_name("listen")
|
||||
.short("l")
|
||||
.long("listen")
|
||||
.takes_value(true)
|
||||
.help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("interactive")
|
||||
.short("i")
|
||||
.long("interactive")
|
||||
.takes_value(false)
|
||||
.help("Interactive mode"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("materialize")
|
||||
.long("materialize")
|
||||
.takes_value(false)
|
||||
.help("Materialize pages constructed by get_page_at"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("daemonize")
|
||||
.short("d")
|
||||
.long("daemonize")
|
||||
.takes_value(false)
|
||||
.help("Run in the background"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("init")
|
||||
.long("init")
|
||||
.takes_value(false)
|
||||
.help("Initialize pageserver repo"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("gc_horizon")
|
||||
.long("gc_horizon")
|
||||
.takes_value(true)
|
||||
.help("Distance from current LSN to perform all wal records cleanup"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("gc_period")
|
||||
.long("gc_period")
|
||||
.takes_value(true)
|
||||
.help("Interval between garbage collector iterations"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("wal_redoers")
|
||||
.long("wal_redoers")
|
||||
.takes_value(true)
|
||||
.help("Number of wal-redo postgres instances"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("workdir")
|
||||
.short("D")
|
||||
.long("workdir")
|
||||
.takes_value(true)
|
||||
.help("Working directory for the pageserver"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("postgres-distrib")
|
||||
.long("postgres-distrib")
|
||||
.takes_value(true)
|
||||
.help("Postgres distribution directory"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
data_dir: PathBuf::from("./"),
|
||||
daemonize: false,
|
||||
interactive: false,
|
||||
wal_producer_connstr: None,
|
||||
listen_addr: "127.0.0.1:5430".parse().unwrap(),
|
||||
skip_recovery: false,
|
||||
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
|
||||
let cfg_file_path = workdir.canonicalize()?.join("pageserver.toml");
|
||||
|
||||
let args_params = CfgFileParams::from_args(&arg_matches);
|
||||
|
||||
let init = arg_matches.is_present("init");
|
||||
let params = if init {
|
||||
// We're initializing the repo, so there's no config file yet
|
||||
args_params
|
||||
} else {
|
||||
// Supplement the CLI arguments with the config file
|
||||
let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)?;
|
||||
let file_params: CfgFileParams = toml::from_str(&cfg_file_contents)?;
|
||||
args_params.or(file_params)
|
||||
};
|
||||
|
||||
if let Some(dir) = arg_matches.value_of("datadir") {
|
||||
conf.data_dir = PathBuf::from(dir);
|
||||
}
|
||||
// Ensure the config is valid, even if just init-ing
|
||||
let mut conf = params.try_into_config()?;
|
||||
|
||||
if arg_matches.is_present("daemonize") {
|
||||
conf.daemonize = true;
|
||||
}
|
||||
conf.daemonize = arg_matches.is_present("daemonize");
|
||||
conf.interactive = arg_matches.is_present("interactive");
|
||||
conf.materialize = arg_matches.is_present("materialize");
|
||||
|
||||
if arg_matches.is_present("interactive") {
|
||||
conf.interactive = true;
|
||||
if init && (conf.daemonize || conf.interactive) {
|
||||
eprintln!("--daemonize and --interactive may not be used with --init");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if conf.daemonize && conf.interactive {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"--daemonize is not allowed with --interactive: choose one",
|
||||
));
|
||||
eprintln!("--daemonize is not allowed with --interactive: choose one");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if arg_matches.is_present("skip_recovery") {
|
||||
conf.skip_recovery = true;
|
||||
// The configuration is all set up now. Turn it into a 'static
|
||||
// that can be freely stored in structs and passed across threads
|
||||
// as a ref.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
// Create repo and exit if init was requested
|
||||
if init {
|
||||
branches::init_repo(conf, &workdir)?;
|
||||
|
||||
// write the config file
|
||||
let cfg_file_contents = toml::to_string_pretty(¶ms)?;
|
||||
std::fs::write(&cfg_file_path, cfg_file_contents)?;
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("wal_producer") {
|
||||
conf.wal_producer_connstr = Some(String::from_str(addr).unwrap());
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("listen") {
|
||||
conf.listen_addr = addr.parse().unwrap();
|
||||
}
|
||||
// Set CWD to workdir for non-daemon modes
|
||||
env::set_current_dir(&workdir)?;
|
||||
|
||||
start_pageserver(conf)
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: PageServerConf) -> Result<(), io::Error> {
|
||||
fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
let log_filename = "pageserver.log";
|
||||
// Don't open the same file for output multiple times;
|
||||
// the different fds could overwrite each other's output.
|
||||
let log_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", &log_filename))?;
|
||||
|
||||
// Initialize logger
|
||||
let _scope_guard = init_logging(&conf);
|
||||
let _log_guard = slog_stdlog::init().unwrap();
|
||||
let logger_file = log_file.try_clone().unwrap();
|
||||
let _scope_guard = init_logging(&conf, logger_file)?;
|
||||
let _log_guard = slog_stdlog::init()?;
|
||||
|
||||
// Note: this `info!(...)` macro comes from `log` crate
|
||||
info!("standard logging redirected to slog");
|
||||
|
||||
let tui_thread: Option<thread::JoinHandle<()>>;
|
||||
if conf.interactive {
|
||||
let tui_thread = if conf.interactive {
|
||||
// Initialize the UI
|
||||
tui_thread = Some(
|
||||
Some(
|
||||
thread::Builder::new()
|
||||
.name("UI thread".into())
|
||||
.spawn(|| {
|
||||
let _ = tui::ui_main();
|
||||
})
|
||||
.unwrap(),
|
||||
);
|
||||
//threads.push(tui_thread);
|
||||
)
|
||||
} else {
|
||||
tui_thread = None;
|
||||
}
|
||||
None
|
||||
};
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
// There shouldn't be any logging to stdout/stderr. Redirect it to the main log so
|
||||
// that we will see any accidental manual fpritf's or backtraces.
|
||||
let stdout = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(conf.data_dir.join("pageserver.log"))
|
||||
.unwrap();
|
||||
let stderr = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(conf.data_dir.join("pageserver.log"))
|
||||
.unwrap();
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
let stdout = log_file.try_clone().unwrap();
|
||||
let stderr = log_file;
|
||||
|
||||
let daemonize = Daemonize::new()
|
||||
.pid_file(conf.data_dir.join("pageserver.pid"))
|
||||
.working_directory(conf.data_dir.clone())
|
||||
.pid_file("pageserver.pid")
|
||||
.working_directory(".")
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
|
||||
@@ -152,88 +292,63 @@ fn start_pageserver(conf: PageServerConf) -> Result<(), io::Error> {
|
||||
}
|
||||
}
|
||||
|
||||
let mut threads = Vec::new();
|
||||
// Check that we can bind to address before starting threads to simplify shutdown
|
||||
// sequence if port is occupied.
|
||||
info!("Starting pageserver on {}", conf.listen_addr);
|
||||
let pageserver_listener = TcpListener::bind(conf.listen_addr.clone())?;
|
||||
|
||||
info!("starting...");
|
||||
// Initialize page cache, this will spawn walredo_thread
|
||||
page_cache::init(conf);
|
||||
|
||||
// Before opening up for connections, restore the latest base backup from S3.
|
||||
// (We don't persist anything to local disk at the moment, so we need to do
|
||||
// this at every startup)
|
||||
if !conf.skip_recovery {
|
||||
restore_s3::restore_main(&conf);
|
||||
}
|
||||
|
||||
// Create directory for wal-redo datadirs
|
||||
match fs::create_dir(conf.data_dir.join("wal-redo")) {
|
||||
Ok(_) => {}
|
||||
Err(e) => match e.kind() {
|
||||
io::ErrorKind::AlreadyExists => {}
|
||||
_ => {
|
||||
panic!("Failed to create wal-redo data directory: {}", e);
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
// Launch the WAL receiver thread if pageserver was started with --wal-producer
|
||||
// option. It will try to connect to the WAL safekeeper, and stream the WAL. If
|
||||
// the connection is lost, it will reconnect on its own. We just fire and forget
|
||||
// it here.
|
||||
//
|
||||
// All other wal receivers are started on demand by "callmemaybe" command
|
||||
// sent to pageserver.
|
||||
let conf_copy = conf.clone();
|
||||
if let Some(wal_producer) = conf.wal_producer_connstr {
|
||||
let conf = conf_copy.clone();
|
||||
let walreceiver_thread = thread::Builder::new()
|
||||
.name("static WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
walreceiver::thread_main(conf, &wal_producer);
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(walreceiver_thread);
|
||||
}
|
||||
|
||||
// GetPage@LSN requests are served by another thread. (It uses async I/O,
|
||||
// but the code in page_service sets up it own thread pool for that)
|
||||
let conf = conf_copy.clone();
|
||||
let page_server_thread = thread::Builder::new()
|
||||
// Spawn a thread to listen for connections. It will spawn further threads
|
||||
// for each connection.
|
||||
let page_service_thread = thread::Builder::new()
|
||||
.name("Page Service thread".into())
|
||||
.spawn(|| {
|
||||
// thread code
|
||||
page_service::thread_main(conf);
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(page_server_thread);
|
||||
.spawn(move || page_service::thread_main(conf, pageserver_listener))?;
|
||||
|
||||
if tui_thread.is_some() {
|
||||
if let Some(tui_thread) = tui_thread {
|
||||
// The TUI thread exits when the user asks to Quit.
|
||||
tui_thread.unwrap().join().unwrap();
|
||||
tui_thread.join().unwrap();
|
||||
} else {
|
||||
// In non-interactive mode, wait forever.
|
||||
for t in threads {
|
||||
t.join().unwrap()
|
||||
}
|
||||
page_service_thread
|
||||
.join()
|
||||
.expect("Page service thread has panicked")?
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn init_logging(conf: &PageServerConf) -> slog_scope::GlobalLoggerGuard {
|
||||
fn init_logging(
|
||||
conf: &PageServerConf,
|
||||
log_file: File,
|
||||
) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
|
||||
if conf.interactive {
|
||||
tui::init_logging()
|
||||
Ok(tui::init_logging())
|
||||
} else if conf.daemonize {
|
||||
let log = conf.data_dir.join("pageserver.log");
|
||||
let log_file = File::create(log).unwrap_or_else(|_| panic!("Could not create log file"));
|
||||
let decorator = slog_term::PlainSyncDecorator::new(log_file);
|
||||
let drain = slog_term::CompactFormat::new(decorator).build();
|
||||
let drain = slog_term::FullFormat::new(decorator).build();
|
||||
let drain = slog::Filter::new(drain, |record: &slog::Record| {
|
||||
if record.level().is_at_least(slog::Level::Info) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
false
|
||||
});
|
||||
let drain = std::sync::Mutex::new(drain).fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
slog_scope::set_global_logger(logger)
|
||||
let logger = slog::Logger::root(
|
||||
drain,
|
||||
slog::o!(
|
||||
"location" =>
|
||||
FnValue(move |record| {
|
||||
format!("{}, {}:{}",
|
||||
record.module(),
|
||||
record.file(),
|
||||
record.line()
|
||||
)
|
||||
}
|
||||
)
|
||||
),
|
||||
);
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
} else {
|
||||
let decorator = slog_term::TermDecorator::new().build();
|
||||
let drain = slog_term::FullFormat::new(decorator).build().fuse();
|
||||
@@ -247,10 +362,10 @@ fn init_logging(conf: &PageServerConf) -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
false
|
||||
})
|
||||
.fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
slog_scope::set_global_logger(logger)
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
}
|
||||
}
|
||||
|
||||
433 pageserver/src/branches.rs (new file)
@@ -0,0 +1,433 @@
|
||||
//!
|
||||
//! Branch management code
|
||||
//!
|
||||
// TODO: move all paths construction to conf impl
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use fs::File;
|
||||
use postgres_ffi::{pg_constants, xlog_utils, ControlFileData};
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::env;
|
||||
use std::io::{Read, Write};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
fs, io,
|
||||
path::{Path, PathBuf},
|
||||
process::{Command, Stdio},
|
||||
str::FromStr,
|
||||
};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::restore_local_repo;
|
||||
use crate::{repository::Repository, PageServerConf, ZTimelineId};
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct BranchInfo {
|
||||
pub name: String,
|
||||
pub timeline_id: ZTimelineId,
|
||||
pub latest_valid_lsn: Option<Lsn>,
|
||||
pub ancestor_id: Option<String>,
|
||||
pub ancestor_lsn: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PointInTime {
|
||||
pub timelineid: ZTimelineId,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
pub fn init_repo(conf: &'static PageServerConf, repo_dir: &Path) -> Result<()> {
|
||||
// top-level dir may exist if we are creating it through CLI
|
||||
fs::create_dir_all(repo_dir)
|
||||
.with_context(|| format!("could not create directory {}", repo_dir.display()))?;
|
||||
|
||||
env::set_current_dir(repo_dir)?;
|
||||
|
||||
fs::create_dir(std::path::Path::new("timelines"))?;
|
||||
fs::create_dir(std::path::Path::new("refs"))?;
|
||||
fs::create_dir(std::path::Path::new("refs").join("branches"))?;
|
||||
fs::create_dir(std::path::Path::new("refs").join("tags"))?;
|
||||
|
||||
println!("created directory structure in {}", repo_dir.display());
|
||||
|
||||
// Run initdb
|
||||
//
|
||||
// We create the cluster temporarily in a "tmp" directory inside the repository,
|
||||
// and move it to the right location from there.
|
||||
let tmppath = std::path::Path::new("tmp");
|
||||
|
||||
print!("running initdb... ");
|
||||
io::stdout().flush()?;
|
||||
|
||||
let initdb_path = conf.pg_bin_dir().join("initdb");
|
||||
let initdb_output = Command::new(initdb_path)
|
||||
.args(&["-D", tmppath.to_str().unwrap()])
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.with_context(|| "failed to execute initdb")?;
|
||||
if !initdb_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"initdb failed: '{}'",
|
||||
String::from_utf8_lossy(&initdb_output.stderr)
|
||||
);
|
||||
}
|
||||
println!("initdb succeeded");
|
||||
|
||||
// Read control file to extract the LSN and system id
|
||||
let controlfile_path = tmppath.join("global").join("pg_control");
|
||||
let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?;
|
||||
// let systemid = controlfile.system_identifier;
|
||||
let lsn = controlfile.checkPoint;
|
||||
let lsnstr = format!("{:016X}", lsn);
|
||||
|
||||
// Bootstrap the repository by loading the newly-initdb'd cluster into 'main' branch.
|
||||
let tli = create_timeline(conf, None)?;
|
||||
let timelinedir = conf.timeline_path(tli);
|
||||
|
||||
// We don't use page_cache here, because we don't want to spawn the WAL redo thread during
|
||||
// repository initialization.
|
||||
//
|
||||
// FIXME: That caused trouble, because the WAL redo thread launched initdb in the background,
|
||||
// and it kept running even after the "zenith init" had exited. In tests, we started the
|
||||
// page server immediately after that, so that initdb was still running in the background,
|
||||
// and we failed to run initdb again in the same directory. This has been solved for the
|
||||
// rapid init+start case now, but the general race condition remains if you restart the
|
||||
// server quickly.
|
||||
let storage = crate::rocksdb_storage::RocksObjectStore::create(conf)?;
|
||||
//let storage = crate::inmem_storage::InmemObjectStore::create(conf)?;
|
||||
|
||||
let repo = crate::object_repository::ObjectRepository::new(
|
||||
conf,
|
||||
std::sync::Arc::new(storage),
|
||||
std::sync::Arc::new(crate::walredo::DummyRedoManager {}),
|
||||
);
|
||||
let timeline = repo.create_empty_timeline(tli, Lsn(lsn))?;
|
||||
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(&tmppath, &*timeline, Lsn(lsn))?;
|
||||
|
||||
// Move the initial WAL file
|
||||
fs::rename(
|
||||
tmppath.join("pg_wal").join("000000010000000000000001"),
|
||||
timelinedir
|
||||
.join("wal")
|
||||
.join("000000010000000000000001.partial"),
|
||||
)?;
|
||||
println!("created initial timeline {}", tli);
|
||||
|
||||
let data = tli.to_string();
|
||||
fs::write(conf.branch_path("main"), data)?;
|
||||
println!("created main branch");
|
||||
|
||||
// Remove pg_wal
|
||||
fs::remove_dir_all(tmppath.join("pg_wal"))?;
|
||||
|
||||
// Move the data directory as an initial base backup.
|
||||
// FIXME: It would be enough to only copy the non-relation files here, the relation
|
||||
// data was already loaded into the repository.
|
||||
let target = timelinedir.join("snapshots").join(&lsnstr);
|
||||
fs::rename(tmppath, &target)?;
|
||||
|
||||
println!(
|
||||
"new zenith repository was created in {}",
|
||||
repo_dir.display()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn get_branches(conf: &PageServerConf) -> Result<Vec<BranchInfo>> {
|
||||
let repo = page_cache::get_repository();
|
||||
|
||||
// Each branch has a corresponding record (text file) in the refs/branches
|
||||
// with timeline_id.
|
||||
let branches_dir = std::path::Path::new("refs").join("branches");
|
||||
|
||||
std::fs::read_dir(&branches_dir)?
|
||||
.map(|dir_entry_res| {
|
||||
let dir_entry = dir_entry_res?;
|
||||
let name = dir_entry.file_name().to_str().unwrap().to_string();
|
||||
let timeline_id = std::fs::read_to_string(dir_entry.path())?.parse::<ZTimelineId>()?;
|
||||
|
||||
let latest_valid_lsn = repo
|
||||
.get_timeline(timeline_id)
|
||||
.map(|timeline| timeline.get_last_valid_lsn())
|
||||
.ok();
|
||||
|
||||
let ancestor_path = conf.ancestor_path(timeline_id);
|
||||
let mut ancestor_id: Option<String> = None;
|
||||
let mut ancestor_lsn: Option<String> = None;
|
||||
|
||||
if ancestor_path.exists() {
|
||||
let ancestor = std::fs::read_to_string(ancestor_path)?;
|
||||
let mut strings = ancestor.split('@');
|
||||
|
||||
ancestor_id = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
ancestor_lsn = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(BranchInfo {
|
||||
name,
|
||||
timeline_id,
|
||||
latest_valid_lsn,
|
||||
ancestor_id,
|
||||
ancestor_lsn,
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
}
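// Illustrative sketch only (not part of the original change): the shape of one
// BranchInfo entry produced above, for a branch that has an "ancestor" file.
// All concrete values are made up.
//
// BranchInfo {
//     name: "mybranch".to_string(),
//     timeline_id: "bc62e7d612d0e6fe8f99a6dd2f281f9d".parse().unwrap(),
//     latest_valid_lsn: Some(last_valid_lsn),      // None if the timeline is not loaded
//     ancestor_id: Some("f0e1d2...".to_string()),  // "<timelineid>" part of the ancestor file
//     ancestor_lsn: Some("0/169BFA0".to_string()), // "<lsn>" part of the ancestor file
// }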
|
||||
|
||||
pub(crate) fn get_system_id(conf: &PageServerConf) -> Result<u64> {
|
||||
// let branches = get_branches();
|
||||
|
||||
let branches_dir = std::path::Path::new("refs").join("branches");
|
||||
let branches = std::fs::read_dir(&branches_dir)?
|
||||
.map(|dir_entry_res| {
|
||||
let dir_entry = dir_entry_res?;
|
||||
let name = dir_entry.file_name().to_str().unwrap().to_string();
|
||||
let timeline_id = std::fs::read_to_string(dir_entry.path())?.parse::<ZTimelineId>()?;
|
||||
Ok((name, timeline_id))
|
||||
})
|
||||
.collect::<Result<HashMap<String, ZTimelineId>>>()?;
|
||||
|
||||
let main_tli = branches
|
||||
.get("main")
|
||||
.ok_or_else(|| anyhow!("Branch main not found"))?;
|
||||
|
||||
let (_, main_snap_dir) = find_latest_snapshot(conf, *main_tli)?;
|
||||
let controlfile_path = main_snap_dir.join("global").join("pg_control");
|
||||
let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?;
|
||||
Ok(controlfile.system_identifier)
|
||||
}
|
||||
|
||||
pub(crate) fn create_branch(
|
||||
conf: &PageServerConf,
|
||||
branchname: &str,
|
||||
startpoint_str: &str,
|
||||
) -> Result<BranchInfo> {
|
||||
let repo = page_cache::get_repository();
|
||||
|
||||
if conf.branch_path(&branchname).exists() {
|
||||
anyhow::bail!("branch {} already exists", branchname);
|
||||
}
|
||||
|
||||
let mut startpoint = parse_point_in_time(conf, startpoint_str)?;
|
||||
|
||||
if startpoint.lsn == Lsn(0) {
|
||||
// Find end of WAL on the old timeline
|
||||
let end_of_wal = repo
|
||||
.get_timeline(startpoint.timelineid)?
|
||||
.get_last_record_lsn();
|
||||
println!("branching at end of WAL: {}", end_of_wal);
|
||||
startpoint.lsn = end_of_wal;
|
||||
}
|
||||
|
||||
// create a new timeline directory for it
|
||||
let newtli = create_timeline(conf, Some(startpoint))?;
|
||||
let newtimelinedir = conf.timeline_path(newtli);
|
||||
|
||||
// Let the Repository backend do its initialization
|
||||
repo.branch_timeline(startpoint.timelineid, newtli, startpoint.lsn)?;
|
||||
|
||||
// Copy the latest snapshot (TODO: before the startpoint) and all WAL
|
||||
// TODO: be smarter and avoid the copying...
|
||||
let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(conf, startpoint.timelineid)?;
|
||||
let copy_opts = fs_extra::dir::CopyOptions::new();
|
||||
fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), ©_opts)?;
|
||||
|
||||
let oldtimelinedir = conf.timeline_path(startpoint.timelineid);
|
||||
copy_wal(
|
||||
&oldtimelinedir.join("wal"),
|
||||
&newtimelinedir.join("wal"),
|
||||
startpoint.lsn,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
)?;
|
||||
|
||||
// Remember the human-readable branch name for the new timeline.
|
||||
// FIXME: there's a race condition, if you create a branch with the same
|
||||
// name concurrently.
|
||||
let data = newtli.to_string();
|
||||
fs::write(conf.branch_path(&branchname), data)?;
|
||||
|
||||
Ok(BranchInfo {
|
||||
name: branchname.to_string(),
|
||||
timeline_id: newtli,
|
||||
latest_valid_lsn: Some(startpoint.lsn),
|
||||
ancestor_id: None,
|
||||
ancestor_lsn: None,
|
||||
})
|
||||
}
|
||||
|
||||
//
|
||||
// Parse user-given string that represents a point-in-time.
|
||||
//
|
||||
// We support multiple variants:
|
||||
//
|
||||
// Raw timeline id in hex, meaning the end of that timeline:
|
||||
// bc62e7d612d0e6fe8f99a6dd2f281f9d
|
||||
//
|
||||
// A specific LSN on a timeline:
|
||||
// bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8
|
||||
//
|
||||
// Same, with a human-friendly branch name:
|
||||
// main
|
||||
// main@2/15D3DD8
|
||||
//
|
||||
// Human-friendly tag name:
|
||||
// mytag
|
||||
//
|
||||
//
|
||||
fn parse_point_in_time(conf: &PageServerConf, s: &str) -> Result<PointInTime> {
|
||||
let mut strings = s.split('@');
|
||||
let name = strings.next().unwrap();
|
||||
|
||||
let lsn: Option<Lsn>;
|
||||
if let Some(lsnstr) = strings.next() {
|
||||
lsn = Some(
|
||||
Lsn::from_str(lsnstr).with_context(|| "invalid LSN in point-in-time specification")?,
|
||||
);
|
||||
} else {
|
||||
lsn = None
|
||||
}
|
||||
|
||||
// Check if it's a tag
|
||||
if lsn.is_none() {
|
||||
let tagpath = conf.tag_path(name);
|
||||
if tagpath.exists() {
|
||||
let pointstr = fs::read_to_string(tagpath)?;
|
||||
|
||||
return parse_point_in_time(conf, &pointstr);
|
||||
}
|
||||
}
|
||||
|
||||
// Check if it's a branch
|
||||
// Check if it's branch @ LSN
|
||||
let branchpath = conf.branch_path(name);
|
||||
if branchpath.exists() {
|
||||
let pointstr = fs::read_to_string(branchpath)?;
|
||||
|
||||
let mut result = parse_point_in_time(conf, &pointstr)?;
|
||||
|
||||
result.lsn = lsn.unwrap_or(Lsn(0));
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
// Check if it's a timelineid
|
||||
// Check if it's timelineid @ LSN
|
||||
if let Ok(timelineid) = ZTimelineId::from_str(name) {
|
||||
let tlipath = conf.timeline_path(timelineid);
|
||||
if tlipath.exists() {
|
||||
return Ok(PointInTime {
|
||||
timelineid,
|
||||
lsn: lsn.unwrap_or(Lsn(0)),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
bail!("could not parse point-in-time {}", s);
|
||||
}
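// Illustrative calls (not from the original patch), assuming "refs/branches/main"
// and "refs/tags/mytag" exist and contain point-in-time strings as written by
// create_branch() above:
//
// parse_point_in_time(conf, "bc62e7d612d0e6fe8f99a6dd2f281f9d")?;           // timeline, lsn = Lsn(0)
// parse_point_in_time(conf, "bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8")?; // timeline at explicit LSN
// parse_point_in_time(conf, "main")?;            // branch file -> recurse on its contents
// parse_point_in_time(conf, "main@2/15D3DD8")?;  // branch, then override the LSN
// parse_point_in_time(conf, "mytag")?;           // tag file -> recurse on its contents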
|
||||
|
||||
fn create_timeline(conf: &PageServerConf, ancestor: Option<PointInTime>) -> Result<ZTimelineId> {
|
||||
// Create initial timeline
|
||||
let mut tli_buf = [0u8; 16];
|
||||
rand::thread_rng().fill(&mut tli_buf);
|
||||
let timelineid = ZTimelineId::from(tli_buf);
|
||||
|
||||
let timelinedir = conf.timeline_path(timelineid);
|
||||
|
||||
fs::create_dir(&timelinedir)?;
|
||||
fs::create_dir(&timelinedir.join("snapshots"))?;
|
||||
fs::create_dir(&timelinedir.join("wal"))?;
|
||||
|
||||
if let Some(ancestor) = ancestor {
|
||||
let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);
|
||||
fs::write(timelinedir.join("ancestor"), data)?;
|
||||
}
|
||||
|
||||
Ok(timelineid)
|
||||
}
|
||||
|
||||
///
|
||||
/// Copy all WAL segments from one directory to another, up to given LSN.
|
||||
///
|
||||
/// If the given LSN is in the middle of a segment, the last segment containing it
|
||||
/// is written out as .partial, and padded with zeros.
|
||||
///
|
||||
fn copy_wal(src_dir: &Path, dst_dir: &Path, upto: Lsn, wal_seg_size: usize) -> Result<()> {
|
||||
let last_segno = upto.segment_number(wal_seg_size);
|
||||
let last_segoff = upto.segment_offset(wal_seg_size);
|
||||
|
||||
for entry in fs::read_dir(src_dir)?.flatten() {
|
||||
let entry_name = entry.file_name();
|
||||
let fname = entry_name.to_str().unwrap();
|
||||
|
||||
// Check if the filename looks like an xlog file, or a .partial file.
|
||||
if !xlog_utils::IsXLogFileName(fname) && !xlog_utils::IsPartialXLogFileName(fname) {
|
||||
continue;
|
||||
}
|
||||
let (segno, _tli) = xlog_utils::XLogFromFileName(fname, wal_seg_size as usize);
|
||||
|
||||
let copylen;
|
||||
let mut dst_fname = PathBuf::from(fname);
|
||||
if segno > last_segno {
|
||||
// future segment, skip
|
||||
continue;
|
||||
} else if segno < last_segno {
|
||||
copylen = wal_seg_size;
|
||||
dst_fname.set_extension("");
|
||||
} else {
|
||||
copylen = last_segoff;
|
||||
dst_fname.set_extension("partial");
|
||||
}
|
||||
|
||||
let src_file = File::open(entry.path())?;
|
||||
let mut dst_file = File::create(dst_dir.join(&dst_fname))?;
|
||||
std::io::copy(&mut src_file.take(copylen as u64), &mut dst_file)?;
|
||||
|
||||
if copylen < wal_seg_size {
|
||||
std::io::copy(
|
||||
&mut std::io::repeat(0).take((wal_seg_size - copylen) as u64),
|
||||
&mut dst_file,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
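// A minimal sketch (assumption, not part of the patch) of the segment math used in
// copy_wal() above; the LSN value is arbitrary and only for illustration.
//
// let wal_seg_size = pg_constants::WAL_SEGMENT_SIZE;   // e.g. 16 MB
// let upto = Lsn(0x0000_0001_0169_9D78);               // hypothetical cutoff
// let last_segno = upto.segment_number(wal_seg_size);  // segment containing the cutoff
// let last_segoff = upto.segment_offset(wal_seg_size); // bytes of that segment to keep
// // Segments numbered below last_segno are copied whole; the segment equal to
// // last_segno is truncated to last_segoff bytes, renamed with a ".partial"
// // extension, and zero-padded back to wal_seg_size.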
|
||||
|
||||
// Find the latest snapshot for a timeline
|
||||
fn find_latest_snapshot(conf: &PageServerConf, timeline: ZTimelineId) -> Result<(Lsn, PathBuf)> {
|
||||
let snapshotsdir = conf.snapshots_path(timeline);
|
||||
let paths = fs::read_dir(&snapshotsdir)?;
|
||||
let mut maxsnapshot = Lsn(0);
|
||||
let mut snapshotdir: Option<PathBuf> = None;
|
||||
for path in paths {
|
||||
let path = path?;
|
||||
let filename = path.file_name().to_str().unwrap().to_owned();
|
||||
if let Ok(lsn) = Lsn::from_hex(&filename) {
|
||||
maxsnapshot = std::cmp::max(lsn, maxsnapshot);
|
||||
snapshotdir = Some(path.path());
|
||||
}
|
||||
}
|
||||
if maxsnapshot == Lsn(0) {
|
||||
// TODO: check ancestor timeline
|
||||
anyhow::bail!("no snapshot found in {}", snapshotsdir.display());
|
||||
}
|
||||
|
||||
Ok((maxsnapshot, snapshotdir.unwrap()))
|
||||
}
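// For orientation (an assumption based on the "{:016X}" formatting used in init()
// above): a snapshot taken at checkpoint LSN 0x169BFA0 lives in
//
//     timelines/<timelineid>/snapshots/000000000169BFA0
//
// and Lsn::from_hex() on that directory name recovers the same LSN here.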
|
||||
pageserver/src/inmem_storage.rs (new file, 345 lines)
@@ -0,0 +1,345 @@
|
||||
//!
|
||||
//! An implementation of the ObjectStore interface, backed by BTreeMap
|
||||
//!
|
||||
use crate::object_key::*;
|
||||
use crate::object_store::ObjectStore;
|
||||
use crate::repository::RelTag;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::{bail, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
use std::fs::File;
|
||||
use std::io::prelude::*;
|
||||
use std::ops::Bound::*;
|
||||
use std::sync::RwLock;
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize)]
|
||||
pub struct StorageKey {
|
||||
obj_key: ObjectKey,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl StorageKey {
|
||||
/// The first key for a given timeline
|
||||
fn timeline_start(timeline: ZTimelineId) -> Self {
|
||||
Self {
|
||||
obj_key: ObjectKey {
|
||||
timeline,
|
||||
tag: ObjectTag::FirstTag,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct InmemObjectStore {
|
||||
conf: &'static PageServerConf,
|
||||
db: RwLock<BTreeMap<StorageKey, Vec<u8>>>,
|
||||
}
|
||||
|
||||
impl ObjectStore for InmemObjectStore {
|
||||
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>> {
|
||||
let db = self.db.read().unwrap();
|
||||
let val = db.get(&StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
});
|
||||
if let Some(val) = val {
|
||||
Ok(val.clone())
|
||||
} else {
|
||||
bail!("could not find page {:?}", key);
|
||||
}
|
||||
}
|
||||
|
||||
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>> {
|
||||
let search_key = StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let db = self.db.read().unwrap();
|
||||
for pair in db.range(&search_key..) {
|
||||
let key = pair.0;
|
||||
return Ok(Some(key.obj_key.clone()));
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()> {
|
||||
let mut db = self.db.write().unwrap();
|
||||
db.insert(
|
||||
StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
},
|
||||
value.to_vec(),
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()> {
|
||||
let mut db = self.db.write().unwrap();
|
||||
db.remove(&StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Iterate through page versions of given page, starting from the given LSN.
|
||||
/// The versions are walked in descending LSN order.
|
||||
fn object_versions<'a>(
|
||||
&'a self,
|
||||
key: &ObjectKey,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>> {
|
||||
let from = StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let till = StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
};
|
||||
let db = self.db.read().unwrap();
|
||||
let versions: Vec<(Lsn, Vec<u8>)> = db
|
||||
.range(from..=till)
|
||||
.map(|pair| (pair.0.lsn, pair.1.clone()))
|
||||
.collect();
|
||||
Ok(Box::new(InmemObjectVersionIter::new(versions)))
|
||||
}
|
||||
|
||||
/// Iterate through all timeline objects
|
||||
fn list_objects<'a>(
|
||||
&'a self,
|
||||
timeline: ZTimelineId,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>> {
|
||||
let curr_key = StorageKey::timeline_start(timeline);
|
||||
|
||||
Ok(Box::new(InmemObjectIter {
|
||||
store: &self,
|
||||
curr_key,
|
||||
timeline,
|
||||
nonrel_only,
|
||||
lsn,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Get a list of all distinct relations in given tablespace and database.
|
||||
///
|
||||
/// TODO: This implementation is very inefficient: it scans
|
||||
/// through all entries in the given database. In practice, this
|
||||
/// is used for CREATE DATABASE, and usually the template database is small.
|
||||
/// But if it's not, this will be slow.
|
||||
fn list_rels(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
spcnode: u32,
|
||||
dbnode: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<HashSet<RelTag>> {
|
||||
// FIXME: This scans everything. Very slow
|
||||
|
||||
let mut rels: HashSet<RelTag> = HashSet::new();
|
||||
|
||||
let mut search_rel_tag = RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode: 0,
|
||||
forknum: 0u8,
|
||||
};
|
||||
let db = self.db.read().unwrap();
|
||||
'outer: loop {
|
||||
let search_key = StorageKey {
|
||||
obj_key: ObjectKey {
|
||||
timeline: timelineid,
|
||||
tag: ObjectTag::RelationMetadata(search_rel_tag),
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
for pair in db.range(&search_key..) {
|
||||
let key = pair.0;
|
||||
|
||||
if let ObjectTag::RelationMetadata(rel_tag) = key.obj_key.tag {
|
||||
if spcnode != 0 && rel_tag.spcnode != spcnode
|
||||
|| dbnode != 0 && rel_tag.dbnode != dbnode
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
if key.lsn <= lsn {
|
||||
// visible in this snapshot
|
||||
rels.insert(rel_tag);
|
||||
}
|
||||
search_rel_tag = rel_tag;
|
||||
// skip to next relation
|
||||
// FIXME: What if relnode is u32::MAX ?
|
||||
search_rel_tag.relnode += 1;
|
||||
continue 'outer;
|
||||
} else {
|
||||
// no more relation metadata entries
|
||||
break 'outer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(rels)
|
||||
}
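// Note on the loop above (descriptive, behavior unchanged): it is a skip-scan.
// After finding a RelationMetadata(rel_tag) entry it bumps search_rel_tag.relnode
// by one and re-seeks, so each relation is visited once rather than iterating
// over every stored version.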
|
||||
|
||||
/// Iterate through versions of all objects in a timeline.
|
||||
///
|
||||
/// Returns objects in increasing key-version order.
|
||||
/// Returns all versions up to and including the specified LSN.
|
||||
fn objects<'a>(
|
||||
&'a self,
|
||||
timeline: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>> {
|
||||
let curr_key = StorageKey::timeline_start(timeline);
|
||||
|
||||
Ok(Box::new(InmemObjects {
|
||||
store: &self,
|
||||
curr_key,
|
||||
timeline,
|
||||
lsn,
|
||||
}))
|
||||
}
|
||||
|
||||
fn compact(&self) {}
|
||||
}
|
||||
|
||||
impl Drop for InmemObjectStore {
|
||||
fn drop(&mut self) {
|
||||
let path = self.conf.workdir.join("objstore.dmp");
|
||||
let mut f = File::create(path).unwrap();
|
||||
f.write_all(&self.db.ser().unwrap()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
impl InmemObjectStore {
|
||||
pub fn open(conf: &'static PageServerConf) -> Result<InmemObjectStore> {
|
||||
let path = conf.workdir.join("objstore.dmp");
|
||||
let mut f = File::open(path)?;
|
||||
let mut buffer = Vec::new();
|
||||
// read the whole file
|
||||
f.read_to_end(&mut buffer)?;
|
||||
let db = RwLock::new(BTreeMap::des(&buffer)?);
|
||||
Ok(InmemObjectStore { conf: conf, db })
|
||||
}
|
||||
|
||||
pub fn create(conf: &'static PageServerConf) -> Result<InmemObjectStore> {
|
||||
Ok(InmemObjectStore {
|
||||
conf: conf,
|
||||
db: RwLock::new(BTreeMap::new()),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator for `object_versions`. Returns all page versions of a given block, in
|
||||
/// reverse LSN order.
|
||||
///
|
||||
struct InmemObjectVersionIter {
|
||||
versions: Vec<(Lsn, Vec<u8>)>,
|
||||
curr: usize,
|
||||
}
|
||||
impl InmemObjectVersionIter {
|
||||
fn new(versions: Vec<(Lsn, Vec<u8>)>) -> InmemObjectVersionIter {
|
||||
let curr = versions.len();
|
||||
InmemObjectVersionIter { versions, curr }
|
||||
}
|
||||
}
|
||||
impl Iterator for InmemObjectVersionIter {
|
||||
type Item = (Lsn, Vec<u8>);
|
||||
|
||||
fn next(&mut self) -> std::option::Option<Self::Item> {
|
||||
if self.curr == 0 {
|
||||
None
|
||||
} else {
|
||||
self.curr -= 1;
|
||||
Some(self.versions[self.curr].clone())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct InmemObjects<'r> {
|
||||
store: &'r InmemObjectStore,
|
||||
curr_key: StorageKey,
|
||||
timeline: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl<'r> Iterator for InmemObjects<'r> {
|
||||
// TODO consider returning Box<[u8]>
|
||||
type Item = Result<(ObjectTag, Lsn, Vec<u8>)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.next_result().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'r> InmemObjects<'r> {
|
||||
fn next_result(&mut self) -> Result<Option<(ObjectTag, Lsn, Vec<u8>)>> {
|
||||
let db = self.store.db.read().unwrap();
|
||||
for pair in db.range((Excluded(&self.curr_key), Unbounded)) {
|
||||
let key = pair.0;
|
||||
if key.obj_key.timeline != self.timeline {
|
||||
return Ok(None);
|
||||
}
|
||||
if key.lsn > self.lsn {
|
||||
// TODO can speed up by seeking iterator
|
||||
continue;
|
||||
}
|
||||
self.curr_key = key.clone();
|
||||
let value = pair.1.clone();
|
||||
return Ok(Some((key.obj_key.tag, key.lsn, value)));
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator for `list_objects`. Returns all objects preceding the specified LSN.
|
||||
///
|
||||
struct InmemObjectIter<'a> {
|
||||
store: &'a InmemObjectStore,
|
||||
curr_key: StorageKey,
|
||||
timeline: ZTimelineId,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for InmemObjectIter<'a> {
|
||||
type Item = ObjectTag;
|
||||
|
||||
fn next(&mut self) -> std::option::Option<Self::Item> {
|
||||
let db = self.store.db.read().unwrap();
|
||||
'outer: loop {
|
||||
for pair in db.range((Excluded(&self.curr_key), Unbounded)) {
|
||||
let key = pair.0;
|
||||
if key.obj_key.timeline != self.timeline {
|
||||
return None;
|
||||
}
|
||||
self.curr_key = key.clone();
|
||||
self.curr_key.lsn = Lsn(u64::MAX); // next seek should skip all versions
|
||||
if key.lsn <= self.lsn {
|
||||
// visible in this snapshot
|
||||
if self.nonrel_only {
|
||||
match key.obj_key.tag {
|
||||
ObjectTag::RelationMetadata(_) => return None,
|
||||
ObjectTag::RelationBuffer(_) => return None,
|
||||
_ => return Some(key.obj_key.tag),
|
||||
}
|
||||
} else {
|
||||
return Some(key.obj_key.tag);
|
||||
}
|
||||
}
|
||||
continue 'outer;
|
||||
}
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,9 +1,21 @@
|
||||
use std::net::SocketAddr;
|
||||
use std::path::PathBuf;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
pub mod basebackup;
|
||||
pub mod branches;
|
||||
pub mod inmem_storage;
|
||||
pub mod object_key;
|
||||
pub mod object_repository;
|
||||
pub mod object_store;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod restore_s3;
|
||||
pub mod repository;
|
||||
pub mod restore_local_repo;
|
||||
pub mod rocksdb_storage;
|
||||
pub mod tui;
|
||||
pub mod tui_event;
|
||||
mod tui_logger;
|
||||
@@ -11,13 +23,122 @@ pub mod waldecoder;
|
||||
pub mod walreceiver;
|
||||
pub mod walredo;
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PageServerConf {
|
||||
pub data_dir: PathBuf,
|
||||
pub daemonize: bool,
|
||||
pub interactive: bool,
|
||||
pub wal_producer_connstr: Option<String>,
|
||||
pub listen_addr: SocketAddr,
|
||||
pub skip_recovery: bool,
|
||||
pub materialize: bool,
|
||||
pub listen_addr: String,
|
||||
pub gc_horizon: u64,
|
||||
pub gc_period: Duration,
|
||||
pub wal_redoers: usize,
|
||||
|
||||
// Repository directory, relative to current working directory.
|
||||
// Normally, the page server changes the current working directory
|
||||
// to the repository, and 'workdir' is always '.'. But we don't do
|
||||
// that during unit testing, because the current directory is global
|
||||
// to the process but different unit tests work on different
|
||||
// repositories.
|
||||
pub workdir: PathBuf,
|
||||
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl PageServerConf {
|
||||
//
|
||||
// Repository paths, relative to workdir.
|
||||
//
|
||||
|
||||
fn tag_path(&self, name: &str) -> PathBuf {
|
||||
self.workdir.join("refs").join("tags").join(name)
|
||||
}
|
||||
|
||||
fn branch_path(&self, name: &str) -> PathBuf {
|
||||
self.workdir.join("refs").join("branches").join(name)
|
||||
}
|
||||
|
||||
fn timeline_path(&self, timelineid: ZTimelineId) -> PathBuf {
|
||||
self.workdir.join("timelines").join(timelineid.to_string())
|
||||
}
|
||||
|
||||
fn snapshots_path(&self, timelineid: ZTimelineId) -> PathBuf {
|
||||
self.timeline_path(timelineid).join("snapshots")
|
||||
}
|
||||
|
||||
fn ancestor_path(&self, timelineid: ZTimelineId) -> PathBuf {
|
||||
self.timeline_path(timelineid).join("ancestor")
|
||||
}
|
||||
|
||||
//
|
||||
// Postgres distribution paths
|
||||
//
|
||||
|
||||
pub fn pg_bin_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("bin")
|
||||
}
|
||||
|
||||
pub fn pg_lib_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("lib")
|
||||
}
|
||||
}
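// Resulting on-disk layout under `workdir` (illustrative summary of the helpers
// above, together with how branches.rs writes these files):
//
//   refs/tags/<tagname>                    point-in-time string
//   refs/branches/<branchname>             timeline id in hex
//   timelines/<timelineid>/snapshots/<lsn> base backup directories, <lsn> in hex
//   timelines/<timelineid>/wal/            WAL segments, the last one ".partial"
//   timelines/<timelineid>/ancestor        "<timelineid>@<lsn>" of the branch point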
|
||||
|
||||
/// Zenith Timeline ID is a 128-bit random ID.
|
||||
///
|
||||
/// Zenith timeline IDs are different from PostgreSQL timeline
|
||||
/// IDs. They serve a similar purpose though: they differentiate
|
||||
/// between different "histories" of the same cluster. However,
|
||||
/// PostgreSQL timeline IDs are a bit cumbersome, because they are only
|
||||
/// 32 bits wide, and they must be in ascending order in any given
|
||||
/// timeline history. Those limitations mean that we cannot generate a
|
||||
/// new PostgreSQL timeline ID by just generating a random number. And
|
||||
/// that in turn is problematic for the "pull/push" workflow, where you
|
||||
/// have a local copy of a zenith repository, and you periodically sync
|
||||
/// the local changes with a remote server. When you work "detached"
|
||||
/// from the remote server, you cannot create a PostgreSQL timeline ID
|
||||
/// that's guaranteed to be different from all existing timelines in
|
||||
/// the remote server. For example, suppose two people each have a clone of
/// the repository on their laptops, and they both create a new branch
/// with a different name. What timeline ID would they assign to their
/// branches? If they pick the same one and later try to push the
/// branches to the same remote server, the branches will get mixed up.
|
||||
///
|
||||
/// To avoid those issues, Zenith has its own concept of timelines that
|
||||
/// is separate from PostgreSQL timelines, and doesn't have those
|
||||
/// limitations. A zenith timeline is identified by a 128-bit ID, which
|
||||
/// is usually printed out as a hex string.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
pub struct ZTimelineId([u8; 16]);
|
||||
|
||||
impl FromStr for ZTimelineId {
|
||||
type Err = hex::FromHexError;
|
||||
|
||||
fn from_str(s: &str) -> Result<ZTimelineId, Self::Err> {
|
||||
let timelineid = hex::decode(s)?;
|
||||
|
||||
let mut buf: [u8; 16] = [0u8; 16];
|
||||
buf.copy_from_slice(timelineid.as_slice());
|
||||
Ok(ZTimelineId(buf))
|
||||
}
|
||||
}
|
||||
|
||||
impl ZTimelineId {
|
||||
pub fn from(b: [u8; 16]) -> ZTimelineId {
|
||||
ZTimelineId(b)
|
||||
}
|
||||
|
||||
pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZTimelineId {
|
||||
let mut arr = [0u8; 16];
|
||||
buf.copy_to_slice(&mut arr);
|
||||
ZTimelineId::from(arr)
|
||||
}
|
||||
|
||||
pub fn as_arr(&self) -> [u8; 16] {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ZTimelineId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.write_str(&hex::encode(self.0))
|
||||
}
|
||||
}
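// Round-trip illustration (not in the original commit); the id literal is arbitrary
// but must be exactly 32 hex digits, since from_str copies into a [u8; 16] buffer.
//
// let tli = ZTimelineId::from_str("bc62e7d612d0e6fe8f99a6dd2f281f9d").unwrap();
// assert_eq!(tli.to_string(), "bc62e7d612d0e6fe8f99a6dd2f281f9d");
// assert_eq!(tli.as_arr().len(), 16); // 128 bits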
|
||||
|
||||
pageserver/src/object_key.rs (new file, 84 lines)
@@ -0,0 +1,84 @@
|
||||
use crate::repository::{BufferTag, RelTag};
|
||||
use crate::waldecoder::TransactionId;
|
||||
use crate::ZTimelineId;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
///
|
||||
/// ObjectKey is the key type used to identify objects stored in an object
|
||||
/// repository. It is shared between object_repository.rs and object_store.rs.
|
||||
/// It is mostly opaque to ObjectStore, it just stores and retrieves objects
|
||||
/// using the key given by the caller.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct ObjectKey {
|
||||
pub timeline: ZTimelineId,
|
||||
pub tag: ObjectTag,
|
||||
}
|
||||
|
||||
///
|
||||
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and pg_multixact)
|
||||
/// in Postgres are handled by SLRU (Simple LRU) buffer, hence the name.
|
||||
///
|
||||
/// These files are global for a postgres instance.
|
||||
///
|
||||
/// These files are divided into segments, which are divided into pages
|
||||
/// of the same BLCKSZ as used for relation files.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct SlruBufferTag {
|
||||
pub blknum: u32,
|
||||
}
|
||||
|
||||
///
|
||||
/// Special type of Postgres files: pg_filenode.map is needed to map
|
||||
/// catalog table OIDs to filenode numbers, which define filename.
|
||||
///
|
||||
/// Each database has a map file for its local mapped catalogs,
|
||||
/// and there is a separate map file for shared catalogs.
|
||||
///
|
||||
/// These files have an atypical size of 512 bytes.
|
||||
///
|
||||
/// See PostgreSQL relmapper.c for details.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct DatabaseTag {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
}
|
||||
|
||||
///
|
||||
/// Non-relation files that keep state for prepared transactions.
|
||||
/// Unlike other files these are not divided into pages.
|
||||
///
|
||||
/// See PostgreSQL twophase.c for details.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct PrepareTag {
|
||||
pub xid: TransactionId,
|
||||
}
|
||||
|
||||
/// ObjectTag is a part of ObjectKey that is specific to the type of
|
||||
/// the stored object.
|
||||
///
|
||||
/// NB: the order of the enum values is significant! In particular,
|
||||
/// rocksdb_storage.rs assumes that TimelineMetadataTag is first
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum ObjectTag {
|
||||
// dummy tag preceding all other keys
|
||||
FirstTag,
|
||||
TimelineMetadataTag,
|
||||
// Special entry that represents PostgreSQL checkpoint.
|
||||
// We use it to track fields needed to restore controlfile checkpoint.
|
||||
Checkpoint,
|
||||
// Various types of non-relation files.
|
||||
// We need them to bootstrap compute node.
|
||||
ControlFile,
|
||||
Clog(SlruBufferTag),
|
||||
MultiXactMembers(SlruBufferTag),
|
||||
MultiXactOffsets(SlruBufferTag),
|
||||
FileNodeMap(DatabaseTag),
|
||||
TwoPhase(PrepareTag),
|
||||
// put relations at the end of enum to allow efficient iterations through non-rel objects
|
||||
RelationMetadata(RelTag),
|
||||
RelationBuffer(BufferTag),
|
||||
}
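// Ordering note (an observation, not new behavior): ObjectTag derives Ord, and for
// derived enums the declaration order above defines the sort order. FirstTag sorts
// before everything, and the Relation* variants sort last, which is what lets
// non-relation objects be scanned without walking through relation data.
//
// assert!(ObjectTag::FirstTag < ObjectTag::ControlFile);
// assert!(ObjectTag::ControlFile < ObjectTag::RelationMetadata(some_rel_tag)); // some_rel_tag: any RelTag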
|
||||
pageserver/src/object_repository.rs (new file, 1274 lines; diff suppressed because it is too large)
pageserver/src/object_store.rs (new file, 88 lines)
@@ -0,0 +1,88 @@
|
||||
//! Low-level key-value storage abstraction.
|
||||
//!
|
||||
use crate::object_key::*;
|
||||
use crate::repository::RelTag;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Result;
|
||||
use std::collections::HashSet;
|
||||
use std::iter::Iterator;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// Low-level storage abstraction.
|
||||
///
|
||||
/// All the data in the repository is stored in a key-value store. This trait
|
||||
/// abstracts the details of the key-value store.
|
||||
///
|
||||
/// A simple key-value store would support just GET and PUT operations with
|
||||
/// a key, but the upper layer needs slightly more complicated read operations.
|
||||
///
|
||||
/// The most frequently used function is 'object_versions'. It is used
|
||||
/// to look up a page version. It is LSN aware, in that the caller
|
||||
/// specifies an LSN, and the function returns all values for that
|
||||
/// block with the same or older LSN.
|
||||
///
|
||||
pub trait ObjectStore: Send + Sync {
|
||||
///
|
||||
/// Store a value with given key.
|
||||
///
|
||||
fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()>;
|
||||
|
||||
/// Read entry with the exact given key.
|
||||
///
|
||||
/// This is used for retrieving metadata with a special key that doesn't
|
||||
/// correspond to any real relation.
|
||||
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>>;
|
||||
|
||||
/// Read the first key greater than or equal to the specified key.
|
||||
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>>;
|
||||
|
||||
/// Iterate through all page versions of one object.
|
||||
///
|
||||
/// Returns all page versions in descending LSN order, along with the LSN
|
||||
/// of each page version.
|
||||
fn object_versions<'a>(
|
||||
&'a self,
|
||||
key: &ObjectKey,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>>;
|
||||
|
||||
/// Iterate through versions of all objects in a timeline.
|
||||
///
|
||||
/// Returns objects in increasing key-version order.
|
||||
/// Returns all versions up to and including the specified LSN.
|
||||
fn objects<'a>(
|
||||
&'a self,
|
||||
timeline: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>>;
|
||||
|
||||
/// Iterate through all keys with given tablespace and database ID, and LSN <= 'lsn'.
|
||||
/// Both dbnode and spcnode can be InvalidId (0), which means that all relations in the tablespace/cluster are returned.
|
||||
///
|
||||
/// This is used to implement 'create database'
|
||||
fn list_rels(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
spcnode: u32,
|
||||
dbnode: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<HashSet<RelTag>>;
|
||||
|
||||
/// Iterate through object tags. If nonrel_only is set, only non-relational data is iterated.
|
||||
///
|
||||
/// This is used to implement GC and to prepare the tarball for new node startup.
|
||||
/// Returns objects in increasing key-version order.
|
||||
fn list_objects<'a>(
|
||||
&'a self,
|
||||
timelineid: ZTimelineId,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>>;
|
||||
|
||||
/// Unlink object (used by GC). This method may actually delete the object or just mark it for deletion.
|
||||
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()>;
|
||||
|
||||
// Compact storage and remove versions marked for deletion
|
||||
fn compact(&self);
|
||||
}
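// Minimal usage sketch of the trait above (illustrative; the key, LSNs and payload
// are made up, and `store` is any ObjectStore implementation):
//
// let key = ObjectKey { timeline, tag: ObjectTag::ControlFile };
// store.put(&key, Lsn(0x0169_BFA0), &control_bytes)?;
// let exact = store.get(&key, Lsn(0x0169_BFA0))?;           // exact (key, lsn) lookup
// for (lsn, value) in store.object_versions(&key, Lsn(u64::MAX))? {
//     // newest first: versions are returned in descending LSN order
// }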
|
||||
@@ -1,657 +1,37 @@
|
||||
//
|
||||
// Page Cache holds all the different page versions and WAL records
|
||||
//
|
||||
// The Page Cache is a BTreeMap, keyed by the RelFileNode and block number, and the LSN.
|
||||
// The BTreeMap is protected by a Mutex, and each cache entry is protected by another
|
||||
// per-entry mutex.
|
||||
//
|
||||
//! This module acts as a switchboard to access different repositories managed by this
|
||||
//! page server. Currently, a Page Server can only manage one repository, so there
|
||||
//! isn't much here. If we implement multi-tenancy, this will probably be changed into
|
||||
//! a hash map, keyed by the tenant ID.
|
||||
|
||||
use core::ops::Bound::Included;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::error::Error;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Arc, Condvar, Mutex};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::{convert::TryInto, ops::AddAssign};
|
||||
// use tokio::sync::RwLock;
|
||||
use bytes::Bytes;
|
||||
use crate::object_repository::ObjectRepository;
|
||||
use crate::repository::Repository;
|
||||
use crate::rocksdb_storage::RocksObjectStore;
|
||||
//use crate::inmem_storage::InmemObjectStore;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use rand::Rng;
|
||||
|
||||
use crate::{walredo, PageServerConf};
|
||||
|
||||
use crossbeam_channel::unbounded;
|
||||
use crossbeam_channel::{Receiver, Sender};
|
||||
|
||||
// Timeout when waiting for the WAL receiver to catch up to an LSN given in a GetPage@LSN call.
|
||||
static TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
pub struct PageCache {
|
||||
shared: Mutex<PageCacheShared>,
|
||||
|
||||
// Channel for communicating with the WAL redo process here.
|
||||
pub walredo_sender: Sender<Arc<CacheEntry>>,
|
||||
pub walredo_receiver: Receiver<Arc<CacheEntry>>,
|
||||
|
||||
valid_lsn_condvar: Condvar,
|
||||
|
||||
// Counters, for metrics collection.
|
||||
pub num_entries: AtomicU64,
|
||||
pub num_page_images: AtomicU64,
|
||||
pub num_wal_records: AtomicU64,
|
||||
pub num_getpage_requests: AtomicU64,
|
||||
|
||||
// copies of shared.first/last_valid_lsn fields (copied here so
|
||||
// that they can be read without acquiring the mutex).
|
||||
pub first_valid_lsn: AtomicU64,
|
||||
pub last_valid_lsn: AtomicU64,
|
||||
pub last_record_lsn: AtomicU64,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct PageCacheStats {
|
||||
pub num_entries: u64,
|
||||
pub num_page_images: u64,
|
||||
pub num_wal_records: u64,
|
||||
pub num_getpage_requests: u64,
|
||||
pub first_valid_lsn: u64,
|
||||
pub last_valid_lsn: u64,
|
||||
pub last_record_lsn: u64,
|
||||
}
|
||||
|
||||
impl AddAssign for PageCacheStats {
|
||||
fn add_assign(&mut self, other: Self) {
|
||||
*self = Self {
|
||||
num_entries: self.num_entries + other.num_entries,
|
||||
num_page_images: self.num_page_images + other.num_page_images,
|
||||
num_wal_records: self.num_wal_records + other.num_wal_records,
|
||||
num_getpage_requests: self.num_getpage_requests + other.num_getpage_requests,
|
||||
first_valid_lsn: self.first_valid_lsn + other.first_valid_lsn,
|
||||
last_valid_lsn: self.last_valid_lsn + other.last_valid_lsn,
|
||||
last_record_lsn: self.last_record_lsn + other.last_record_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Shared data structure, holding page cache and related auxiliary information
|
||||
//
|
||||
struct PageCacheShared {
|
||||
// The actual page cache
|
||||
pagecache: BTreeMap<CacheKey, Arc<CacheEntry>>,
|
||||
|
||||
// Relation n_blocks cache
|
||||
//
|
||||
// This hashtable should be updated together with the pagecache. Now it is
|
||||
// accessed unreasonably often through the smgr_nblocks(). It is better to just
|
||||
// cache it in postgres smgr and ask only on restart.
|
||||
relsize_cache: HashMap<RelTag, u32>,
|
||||
|
||||
// What page versions do we hold in the cache? If we get GetPage with
|
||||
// LSN < first_valid_lsn, that's an error because we (no longer) hold that
|
||||
// page version. If we get a request > last_valid_lsn, we need to wait until
|
||||
// we receive all the WAL up to the request.
|
||||
//
|
||||
// last_record_lsn points to the end of last processed WAL record.
|
||||
// It can lag behind last_valid_lsn, if the WAL receiver has received some WAL
|
||||
// after the end of last record, but not the whole next record yet. In the
|
||||
// page cache, we care about last_valid_lsn, but if the WAL receiver needs to
|
||||
// restart the streaming, it needs to restart at the end of last record, so
|
||||
// we track them separately. last_record_lsn should perhaps be in
|
||||
// walreceiver.rs instead of here, but it seems convenient to keep all three
|
||||
// values together.
|
||||
//
|
||||
first_valid_lsn: u64,
|
||||
last_valid_lsn: u64,
|
||||
last_record_lsn: u64,
|
||||
}
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
lazy_static! {
|
||||
pub static ref PAGECACHES: Mutex<HashMap<u64, Arc<PageCache>>> = Mutex::new(HashMap::new());
|
||||
pub static ref REPOSITORY: Mutex<Option<Arc<dyn Repository>>> = Mutex::new(None);
|
||||
}
|
||||
|
||||
pub fn get_pagecache(conf: PageServerConf, sys_id: u64) -> Arc<PageCache> {
|
||||
let mut pcaches = PAGECACHES.lock().unwrap();
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
let mut m = REPOSITORY.lock().unwrap();
|
||||
|
||||
if !pcaches.contains_key(&sys_id) {
|
||||
pcaches.insert(sys_id, Arc::new(init_page_cache()));
|
||||
let obj_store = RocksObjectStore::open(conf).unwrap();
|
||||
//let obj_store = InmemObjectStore::open(conf).unwrap();
|
||||
|
||||
// Initialize the WAL redo thread
|
||||
//
|
||||
// Currently the join_handle is not saved anywhere and we won't try to restart the thread
// if it dies. We may later stop these threads after some inactivity period
// and restart them on demand.
|
||||
let _walredo_thread = thread::Builder::new()
|
||||
.name("WAL redo thread".into())
|
||||
.spawn(move || {
|
||||
walredo::wal_redo_main(conf, sys_id);
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf);
|
||||
|
||||
pcaches.get(&sys_id).unwrap().clone()
|
||||
// we have already changed current dir to the repository.
|
||||
let repo = ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr));
|
||||
|
||||
*m = Some(Arc::new(repo));
|
||||
}
|
||||
|
||||
fn init_page_cache() -> PageCache {
|
||||
// Initialize the channel between the page cache and the WAL applicator
|
||||
let (s, r) = unbounded();
|
||||
|
||||
PageCache {
|
||||
shared: Mutex::new(PageCacheShared {
|
||||
pagecache: BTreeMap::new(),
|
||||
relsize_cache: HashMap::new(),
|
||||
first_valid_lsn: 0,
|
||||
last_valid_lsn: 0,
|
||||
last_record_lsn: 0,
|
||||
}),
|
||||
valid_lsn_condvar: Condvar::new(),
|
||||
|
||||
walredo_sender: s,
|
||||
walredo_receiver: r,
|
||||
|
||||
num_entries: AtomicU64::new(0),
|
||||
num_page_images: AtomicU64::new(0),
|
||||
num_wal_records: AtomicU64::new(0),
|
||||
num_getpage_requests: AtomicU64::new(0),
|
||||
|
||||
first_valid_lsn: AtomicU64::new(0),
|
||||
last_valid_lsn: AtomicU64::new(0),
|
||||
last_record_lsn: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// We store two kinds of entries in the page cache:
|
||||
//
|
||||
// 1. Ready-made images of the block
|
||||
// 2. WAL records, to be applied on top of the "previous" entry
|
||||
//
|
||||
// Some WAL records will initialize the page from scratch. For such records,
|
||||
// the 'will_init' flag is set. They don't need the previous page image before
|
||||
// applying. The 'will_init' flag is set for records containing a full-page image,
|
||||
// and for records with the BKPBLOCK_WILL_INIT flag. These differ from PageImages
|
||||
// stored directly in the cache entry in that you still need to run the WAL redo
|
||||
// routine to generate the page image.
|
||||
//
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
pub struct CacheKey {
|
||||
pub tag: BufferTag,
|
||||
pub lsn: u64,
|
||||
}
|
||||
|
||||
pub struct CacheEntry {
|
||||
pub key: CacheKey,
|
||||
|
||||
pub content: Mutex<CacheEntryContent>,
|
||||
|
||||
// Condition variable used by the WAL redo service, to wake up
|
||||
// requester.
|
||||
//
|
||||
// FIXME: this takes quite a lot of space. Consider using parking_lot::Condvar
|
||||
// or something else.
|
||||
pub walredo_condvar: Condvar,
|
||||
}
|
||||
|
||||
pub struct CacheEntryContent {
|
||||
pub page_image: Option<Bytes>,
|
||||
pub wal_record: Option<WALRecord>,
|
||||
pub apply_pending: bool,
|
||||
}
|
||||
|
||||
impl CacheEntry {
|
||||
fn new(key: CacheKey) -> CacheEntry {
|
||||
CacheEntry {
|
||||
key: key,
|
||||
content: Mutex::new(CacheEntryContent {
|
||||
page_image: None,
|
||||
wal_record: None,
|
||||
apply_pending: false,
|
||||
}),
|
||||
walredo_condvar: Condvar::new(),
|
||||
}
|
||||
}
|
||||
}
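// Reconstruction rule implied by the comment above (summary, ignoring locking):
// starting from the newest cache entry at or below the requested LSN,
//   * a ready page_image is returned as-is;
//   * a wal_record with will_init set lets WAL redo start from scratch;
//   * a plain wal_record means collect_records_for_apply() keeps walking backwards
//     until it hits a page image or a will_init record, then the records are
//     replayed forward over that base image.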
|
||||
|
||||
#[derive(Eq, PartialEq, Hash, Clone, Copy)]
|
||||
pub struct RelTag {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
pub forknum: u8,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
pub struct BufferTag {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
pub forknum: u8,
|
||||
pub blknum: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct WALRecord {
|
||||
pub lsn: u64, // LSN at the *end* of the record
|
||||
pub will_init: bool,
|
||||
pub rec: Bytes,
|
||||
}
|
||||
|
||||
// Public interface functions
|
||||
|
||||
impl PageCache {
|
||||
//
|
||||
// GetPage@LSN
|
||||
//
|
||||
// Returns an 8k page image
|
||||
//
|
||||
pub fn get_page_at_lsn(&self, tag: BufferTag, lsn: u64) -> Result<Bytes, Box<dyn Error>> {
|
||||
self.num_getpage_requests.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// Look up cache entry. If it's a page image, return that. If it's a WAL record,
|
||||
// ask the WAL redo service to reconstruct the page image from the WAL records.
|
||||
let minkey = CacheKey { tag: tag, lsn: 0 };
|
||||
let maxkey = CacheKey { tag: tag, lsn: lsn };
|
||||
|
||||
let entry_rc: Arc<CacheEntry>;
|
||||
{
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
let mut waited = false;
|
||||
|
||||
while lsn > shared.last_valid_lsn {
|
||||
// TODO: Wait for the WAL receiver to catch up
|
||||
waited = true;
|
||||
trace!(
|
||||
"not caught up yet: {}, requested {}",
|
||||
shared.last_valid_lsn,
|
||||
lsn
|
||||
);
|
||||
let wait_result = self
|
||||
.valid_lsn_condvar
|
||||
.wait_timeout(shared, TIMEOUT)
|
||||
.unwrap();
|
||||
|
||||
shared = wait_result.0;
|
||||
if wait_result.1.timed_out() {
|
||||
return Err(format!(
|
||||
"Timed out while waiting for WAL record at LSN {:X}/{:X} to arrive",
|
||||
lsn >> 32, lsn & 0xffff_ffff
|
||||
))?;
|
||||
}
|
||||
}
|
||||
if waited {
|
||||
trace!("caught up now, continuing");
|
||||
}
|
||||
|
||||
if lsn < shared.first_valid_lsn {
|
||||
return Err(format!("LSN {:X}/{:X} has already been removed",
|
||||
lsn >> 32, lsn & 0xffff_ffff))?;
|
||||
}
|
||||
|
||||
let pagecache = &shared.pagecache;
|
||||
|
||||
let mut entries = pagecache.range((Included(&minkey), Included(&maxkey)));
|
||||
|
||||
let entry_opt = entries.next_back();
|
||||
|
||||
if entry_opt.is_none() {
|
||||
static ZERO_PAGE: [u8; 8192] = [0 as u8; 8192];
|
||||
return Ok(Bytes::from_static(&ZERO_PAGE));
|
||||
/* return Err("could not find page image")?; */
|
||||
}
|
||||
let (_key, entry) = entry_opt.unwrap();
|
||||
entry_rc = entry.clone();
|
||||
|
||||
// Now that we have a reference to the cache entry, drop the lock on the map.
|
||||
// It's important to do this before waiting on the condition variable below,
|
||||
// and better to do it as soon as possible to maximize concurrency.
|
||||
}
|
||||
|
||||
// Lock the cache entry and dig the page image out of it.
|
||||
let page_img: Bytes;
|
||||
{
|
||||
let mut entry_content = entry_rc.content.lock().unwrap();
|
||||
|
||||
if let Some(img) = &entry_content.page_image {
|
||||
assert!(!entry_content.apply_pending);
|
||||
page_img = img.clone();
|
||||
} else if entry_content.wal_record.is_some() {
|
||||
//
|
||||
// If this page needs to be reconstructed by applying some WAL,
|
||||
// send a request to the WAL redo thread.
|
||||
//
|
||||
if !entry_content.apply_pending {
|
||||
assert!(!entry_content.apply_pending);
|
||||
entry_content.apply_pending = true;
|
||||
|
||||
let s = &self.walredo_sender;
|
||||
s.send(entry_rc.clone())?;
|
||||
}
|
||||
|
||||
while entry_content.apply_pending {
|
||||
entry_content = entry_rc.walredo_condvar.wait(entry_content).unwrap();
|
||||
}
|
||||
|
||||
// We should now have a page image. If we don't, it means that WAL redo
|
||||
// failed to reconstruct it. WAL redo should've logged that error already.
|
||||
page_img = match &entry_content.page_image {
|
||||
Some(p) => p.clone(),
|
||||
None => {
|
||||
error!(
|
||||
"could not apply WAL to reconstruct page image for GetPage@LSN request"
|
||||
);
|
||||
return Err("could not apply WAL to reconstruct page image".into());
|
||||
}
|
||||
};
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
return Err(format!("no page image or WAL record for requested page"))?;
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: assumes little-endian. Only used for the debugging log though
|
||||
let page_lsn_hi = u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap());
|
||||
let page_lsn_lo = u32::from_le_bytes(page_img.get(4..8).unwrap().try_into().unwrap());
|
||||
trace!(
|
||||
"Returning page with LSN {:X}/{:X} for {}/{}/{}.{} blk {}",
|
||||
page_lsn_hi,
|
||||
page_lsn_lo,
|
||||
tag.spcnode,
|
||||
tag.dbnode,
|
||||
tag.relnode,
|
||||
tag.forknum,
|
||||
tag.blknum
|
||||
);
|
||||
|
||||
return Ok(page_img);
|
||||
}
|
||||
|
||||
//
|
||||
// Collect all the WAL records that are needed to reconstruct a page
|
||||
// image for the given cache entry.
|
||||
//
|
||||
// Returns an old page image (if any), and a vector of WAL records to apply
|
||||
// over it.
|
||||
//
|
||||
pub fn collect_records_for_apply(&self, entry: &CacheEntry) -> (Option<Bytes>, Vec<WALRecord>) {
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let shared = self.shared.lock().unwrap();
|
||||
let pagecache = &shared.pagecache;
|
||||
|
||||
let minkey = CacheKey {
|
||||
tag: entry.key.tag,
|
||||
lsn: 0,
|
||||
};
|
||||
let maxkey = CacheKey {
|
||||
tag: entry.key.tag,
|
||||
lsn: entry.key.lsn,
|
||||
};
|
||||
let entries = pagecache.range((Included(&minkey), Included(&maxkey)));
|
||||
|
||||
// the last entry in the range should be the CacheEntry we were given
|
||||
//let _last_entry = entries.next_back();
|
||||
//assert!(last_entry == entry);
|
||||
|
||||
let mut base_img: Option<Bytes> = None;
|
||||
let mut records: Vec<WALRecord> = Vec::new();
|
||||
|
||||
// Scan backwards, collecting the WAL records, until we hit an
|
||||
// old page image.
|
||||
for (_key, e) in entries.rev() {
|
||||
let e = e.content.lock().unwrap();
|
||||
|
||||
if let Some(img) = &e.page_image {
|
||||
// We have a base image. No need to dig deeper into the list of
|
||||
// records
|
||||
base_img = Some(img.clone());
|
||||
break;
|
||||
} else if let Some(rec) = &e.wal_record {
|
||||
records.push(rec.clone());
|
||||
|
||||
// If this WAL record initializes the page, no need to dig deeper.
|
||||
if rec.will_init {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
panic!("no base image and no WAL record on cache entry");
|
||||
}
|
||||
}
|
||||
|
||||
records.reverse();
|
||||
return (base_img, records);
|
||||
}
|
||||
|
||||
//
|
||||
// Adds a WAL record to the page cache
|
||||
//
|
||||
pub fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) {
|
||||
let key = CacheKey {
|
||||
tag: tag,
|
||||
lsn: rec.lsn,
|
||||
};
|
||||
|
||||
let entry = CacheEntry::new(key.clone());
|
||||
entry.content.lock().unwrap().wal_record = Some(rec);
|
||||
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
|
||||
let rel_tag = RelTag {
|
||||
spcnode: tag.spcnode,
|
||||
dbnode: tag.dbnode,
|
||||
relnode: tag.relnode,
|
||||
forknum: tag.forknum,
|
||||
};
|
||||
let rel_entry = shared.relsize_cache.entry(rel_tag).or_insert(0);
|
||||
if tag.blknum >= *rel_entry {
|
||||
*rel_entry = tag.blknum + 1;
|
||||
}
|
||||
|
||||
trace!("put_wal_record lsn: {}", key.lsn);
|
||||
|
||||
let oldentry = shared.pagecache.insert(key, Arc::new(entry));
|
||||
self.num_entries.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
if !oldentry.is_none() {
|
||||
error!("overwriting WAL record in page cache");
|
||||
}
|
||||
|
||||
self.num_wal_records.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
//
|
||||
// Memorize a full image of a page version
|
||||
//
|
||||
pub fn put_page_image(&self, tag: BufferTag, lsn: u64, img: Bytes) {
|
||||
let key = CacheKey { tag: tag, lsn: lsn };
|
||||
|
||||
let entry = CacheEntry::new(key.clone());
|
||||
entry.content.lock().unwrap().page_image = Some(img);
|
||||
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
let pagecache = &mut shared.pagecache;
|
||||
|
||||
let oldentry = pagecache.insert(key, Arc::new(entry));
|
||||
self.num_entries.fetch_add(1, Ordering::Relaxed);
|
||||
assert!(oldentry.is_none());
|
||||
|
||||
//debug!("inserted page image for {}/{}/{}_{} blk {} at {}",
|
||||
// tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum, lsn);
|
||||
|
||||
self.num_page_images.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
//
|
||||
pub fn advance_last_valid_lsn(&self, lsn: u64) {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
|
||||
// Can't move backwards.
|
||||
assert!(lsn >= shared.last_valid_lsn);
|
||||
|
||||
shared.last_valid_lsn = lsn;
|
||||
self.valid_lsn_condvar.notify_all();
|
||||
|
||||
self.last_valid_lsn.store(lsn, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
//
|
||||
// NOTE: this updates last_valid_lsn as well.
|
||||
//
|
||||
pub fn advance_last_record_lsn(&self, lsn: u64) {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
|
||||
// Can't move backwards.
|
||||
assert!(lsn >= shared.last_valid_lsn);
|
||||
assert!(lsn >= shared.last_record_lsn);
|
||||
|
||||
shared.last_valid_lsn = lsn;
|
||||
shared.last_record_lsn = lsn;
|
||||
self.valid_lsn_condvar.notify_all();
|
||||
|
||||
self.last_valid_lsn.store(lsn, Ordering::Relaxed);
|
||||
self.last_record_lsn.store(lsn, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
//
|
||||
pub fn _advance_first_valid_lsn(&self, lsn: u64) {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
|
||||
// Can't move backwards.
|
||||
assert!(lsn >= shared.first_valid_lsn);
|
||||
|
||||
// Can't overtake last_valid_lsn (except when we're
|
||||
// initializing the system and last_valid_lsn hasn't been set yet.
|
||||
assert!(shared.last_valid_lsn == 0 || lsn < shared.last_valid_lsn);
|
||||
|
||||
shared.first_valid_lsn = lsn;
|
||||
self.first_valid_lsn.store(lsn, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn init_valid_lsn(&self, lsn: u64) {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
|
||||
assert!(shared.first_valid_lsn == 0);
|
||||
assert!(shared.last_valid_lsn == 0);
|
||||
assert!(shared.last_record_lsn == 0);
|
||||
|
||||
shared.first_valid_lsn = lsn;
|
||||
shared.last_valid_lsn = lsn;
|
||||
shared.last_record_lsn = lsn;
|
||||
|
||||
self.first_valid_lsn.store(lsn, Ordering::Relaxed);
|
||||
self.last_valid_lsn.store(lsn, Ordering::Relaxed);
|
||||
self.last_record_lsn.store(lsn, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn get_last_valid_lsn(&self) -> u64 {
|
||||
let shared = self.shared.lock().unwrap();
|
||||
|
||||
return shared.last_record_lsn;
|
||||
}
|
||||
|
||||
//
|
||||
// Simple test function for the WAL redo code:
|
||||
//
|
||||
// 1. Pick a page from the page cache at random.
|
||||
// 2. Request that page with GetPage@LSN, using Max LSN (i.e. get the latest page version)
|
||||
//
|
||||
//
|
||||
pub fn _test_get_page_at_lsn(&self) {
|
||||
// For quick testing of the get_page_at_lsn() function.
|
||||
//
|
||||
// Get a random page from the page cache. Apply all its WAL, by requesting
|
||||
// that page at the highest lsn.
|
||||
|
||||
let mut tag: Option<BufferTag> = None;
|
||||
|
||||
{
|
||||
let shared = self.shared.lock().unwrap();
|
||||
let pagecache = &shared.pagecache;
|
||||
|
||||
if pagecache.is_empty() {
|
||||
info!("page cache is empty");
|
||||
return;
|
||||
}
|
||||
|
||||
// Find nth entry in the map, where n is picked at random
|
||||
let n = rand::thread_rng().gen_range(0..pagecache.len());
|
||||
let mut i = 0;
|
||||
for (key, _e) in pagecache.iter() {
|
||||
if i == n {
|
||||
tag = Some(key.tag);
|
||||
break;
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
info!("testing GetPage@LSN for block {}", tag.unwrap().blknum);
|
||||
match self.get_page_at_lsn(tag.unwrap(), 0xffff_ffff_ffff_eeee) {
|
||||
Ok(_img) => {
|
||||
// This prints out the whole page image.
|
||||
//println!("{:X?}", img);
|
||||
}
|
||||
Err(error) => {
|
||||
error!("GetPage@LSN failed: {}", error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: Shouldn't relation size also be tracked with an LSN?
|
||||
// If a replica is lagging behind, it needs to get the size as it was on
|
||||
// the replica's current replay LSN.
|
||||
pub fn relsize_inc(&self, rel: &RelTag, to: Option<u32>) {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
let entry = shared.relsize_cache.entry(*rel).or_insert(0);
|
||||
|
||||
if let Some(to) = to {
|
||||
if to >= *entry {
|
||||
*entry = to + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn relsize_get(&self, rel: &RelTag) -> u32 {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
let entry = shared.relsize_cache.entry(*rel).or_insert(0);
|
||||
*entry
|
||||
}
|
||||
|
||||
pub fn relsize_exist(&self, rel: &RelTag) -> bool {
|
||||
let shared = self.shared.lock().unwrap();
|
||||
let relsize_cache = &shared.relsize_cache;
|
||||
relsize_cache.contains_key(rel)
|
||||
}
|
||||
|
||||
pub fn get_stats(&self) -> PageCacheStats {
|
||||
PageCacheStats {
|
||||
num_entries: self.num_entries.load(Ordering::Relaxed),
|
||||
num_page_images: self.num_page_images.load(Ordering::Relaxed),
|
||||
num_wal_records: self.num_wal_records.load(Ordering::Relaxed),
|
||||
num_getpage_requests: self.num_getpage_requests.load(Ordering::Relaxed),
|
||||
first_valid_lsn: self.first_valid_lsn.load(Ordering::Relaxed),
|
||||
last_valid_lsn: self.last_valid_lsn.load(Ordering::Relaxed),
|
||||
last_record_lsn: self.last_record_lsn.load(Ordering::Relaxed),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_stats() -> PageCacheStats {
|
||||
let pcaches = PAGECACHES.lock().unwrap();
|
||||
|
||||
let mut stats = PageCacheStats {
|
||||
num_entries: 0,
|
||||
num_page_images: 0,
|
||||
num_wal_records: 0,
|
||||
num_getpage_requests: 0,
|
||||
first_valid_lsn: 0,
|
||||
last_valid_lsn: 0,
|
||||
last_record_lsn: 0,
|
||||
};
|
||||
|
||||
pcaches.iter().for_each(|(_sys_id, pcache)| {
|
||||
stats += pcache.get_stats();
|
||||
});
|
||||
stats
|
||||
pub fn get_repository() -> Arc<dyn Repository> {
|
||||
let o = &REPOSITORY.lock().unwrap();
|
||||
Arc::clone(o.as_ref().unwrap())
|
||||
}
|
||||
|
||||
(File diff suppressed because it is too large)
pageserver/src/repository.rs (new file, 643 lines)
@@ -0,0 +1,643 @@
|
||||
use crate::object_key::*;
|
||||
use crate::waldecoder::TransactionId;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use postgres_ffi::nonrelfile_utils::transaction_id_get_status;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::forknumber_to_name;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::iter::Iterator;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// A repository corresponds to one .zenith directory. One repository holds multiple
|
||||
/// timelines, forked off from the same initial call to 'initdb'.
|
||||
pub trait Repository: Send + Sync {
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Create a new, empty timeline. The caller is responsible for loading data into it
|
||||
fn create_empty_timeline(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
start_lsn: Lsn,
|
||||
) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
|
||||
|
||||
//fn get_stats(&self) -> RepositoryStats;
|
||||
}
|
||||
|
||||
///
|
||||
/// Result of performing GC
|
||||
///
|
||||
#[derive(Default)]
|
||||
pub struct GcResult {
|
||||
pub n_relations: u64,
|
||||
pub inspected: u64,
|
||||
pub truncated: u64,
|
||||
pub deleted: u64,
|
||||
pub prep_deleted: u64, // 2PC prepare
|
||||
pub slru_deleted: u64, // SLRU (clog, multixact)
|
||||
pub chkp_deleted: u64, // Checkpoints
|
||||
pub dropped: u64,
|
||||
pub elapsed: Duration,
|
||||
}
|
||||
|
||||
pub trait Timeline: Send + Sync {
|
||||
//------------------------------------------------------------------------------
|
||||
// Public GET functions
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn(&self, tag: ObjectTag, lsn: Lsn) -> Result<Bytes>;
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn_nowait(&self, tag: ObjectTag, lsn: Lsn, materialize: bool) -> Result<Bytes>;
|
||||
|
||||
/// Get size of relation
|
||||
fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<u32>;
|
||||
|
||||
/// Does relation exist?
|
||||
fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool>;
|
||||
|
||||
/// Get a list of all distinct relations in given tablespace and database.
|
||||
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>>;
|
||||
|
||||
/// Get a list of non-relational objects
|
||||
fn list_nonrels<'a>(&'a self, lsn: Lsn) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>>;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Public PUT functions, to update the repository with new page versions.
|
||||
//
|
||||
// These are called by the WAL receiver to digest WAL records.
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Put a new page version that can be constructed from a WAL record
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
fn put_wal_record(&self, tag: ObjectTag, rec: WALRecord) -> Result<()>;
|
||||
|
||||
/// Put raw data
|
||||
fn put_raw_data(&self, tag: ObjectTag, lsn: Lsn, data: &[u8]) -> Result<()>;
|
||||
|
||||
/// Like put_wal_record, but with ready-made image of the page.
|
||||
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes, update_meta: bool)
|
||||
-> Result<()>;
|
||||
|
||||
/// Truncate relation
|
||||
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()>;
|
||||
|
||||
/// Unlink relation. This method is used for marking dropped relations.
|
||||
fn put_unlink(&self, tag: RelTag, lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// Truncate SLRU segment
|
||||
fn put_slru_truncate(&self, tag: ObjectTag, lsn: Lsn) -> Result<()>;
|
||||
|
||||
// Get the object tag greater than or equal to the specified one
|
||||
fn get_next_tag(&self, tag: ObjectTag) -> Result<Option<ObjectTag>>;
|
||||
|
||||
/// Remember that all WAL before the given LSN has been processed.
|
||||
///
|
||||
/// The WAL receiver calls this after the put_* functions, to indicate that
|
||||
/// all WAL before this point has been digested. Before that, if you call
|
||||
/// GET on an earlier LSN, it will block.
|
||||
fn advance_last_valid_lsn(&self, lsn: Lsn);
|
||||
fn get_last_valid_lsn(&self) -> Lsn;
|
||||
fn init_valid_lsn(&self, lsn: Lsn);
|
||||
|
||||
/// Like `advance_last_valid_lsn`, but this always points to the end of
|
||||
/// a WAL record, not in the middle of one.
|
||||
///
|
||||
/// This must be <= last valid LSN. This is tracked separately from last
|
||||
/// valid LSN, so that the WAL receiver knows where to restart streaming.
|
||||
fn advance_last_record_lsn(&self, lsn: Lsn);
|
||||
fn get_last_record_lsn(&self) -> Lsn;
|
||||
|
||||
// Like `get_last_record_lsn`, but points to the start position of the last record
|
||||
fn get_prev_record_lsn(&self) -> Lsn;
|
||||
|
||||
///
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
|
||||
/// know anything about them here in the repository.
|
||||
fn checkpoint(&self) -> Result<()>;
|
||||
|
||||
/// Events for all relations in the timeline.
|
||||
/// Contains updates from start up to the last valid LSN
|
||||
/// at time of history() call. This lsn can be read via the lsn() function.
|
||||
///
|
||||
/// Relation size is increased implicitly and decreased with Truncate updates.
|
||||
// TODO ordering guarantee?
|
||||
fn history<'a>(&'a self) -> Result<Box<dyn History + 'a>>;
|
||||
|
||||
/// Perform one garbage collection iteration.
|
||||
/// Garbage collection is periodically performed by GC thread,
|
||||
/// but it can be explicitly requested through page server API.
|
||||
///
|
||||
/// `horizon` specifies delta from last LSN to preserve all object versions (PITR interval).
|
||||
/// `compact` parameter is used to force compaction of storage.
|
||||
/// Some storage implementations are based on an LSM tree and require periodic merging (compaction).
|
||||
/// Usually the storage implementation determines by itself when compaction should be performed.
|
||||
/// But for GC tests it may be useful to force compaction just after completion of a GC iteration
|
||||
/// to make sure that all detected garbage is removed.
|
||||
/// So right now `compact` is set to true when GC is explicitly requested through the page server API,
|
||||
/// and is set to false in the GC thread, which repeats GC iterations in an infinite loop.
|
||||
fn gc_iteration(&self, horizon: u64, compact: bool) -> Result<GcResult>;
|
||||
|
||||
// Check transaction status
|
||||
fn get_tx_status(&self, xid: TransactionId, lsn: Lsn) -> anyhow::Result<u8> {
|
||||
let blknum = xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
|
||||
let clog_page = self.get_page_at_lsn(tag, lsn)?;
|
||||
let status = transaction_id_get_status(xid, &clog_page[..]);
|
||||
Ok(status)
|
||||
}
|
||||
}
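
// --- Illustrative usage sketch (not part of the original diff) ---
// A minimal, hedged example of the ingest flow that the PUT/advance_* comments
// above describe: the WAL receiver digests a record first and only then
// advances the valid LSN, so GetPage@LSN never observes a half-applied record.
// The function name is hypothetical.
#[allow(dead_code)]
fn example_ingest_one_record(tline: &dyn Timeline, tag: ObjectTag, rec: WALRecord) -> Result<()> {
    let end_lsn = rec.lsn;
    // Digest the record...
    tline.put_wal_record(tag, rec)?;
    // ...then make everything up to its end LSN visible to readers.
    tline.advance_last_valid_lsn(end_lsn);
    tline.advance_last_record_lsn(end_lsn);
    Ok(())
}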
|
||||
|
||||
pub trait History: Iterator<Item = Result<RelationUpdate>> {
|
||||
/// The last_valid_lsn at the time of history() call.
|
||||
fn lsn(&self) -> Lsn;
|
||||
}
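
// --- Illustrative usage sketch (not part of the original diff) ---
// Rough example of draining a `History` iterator; the function name and the
// println! output are hypothetical, shown only to make the trait's shape concrete.
#[allow(dead_code)]
fn example_dump_history(timeline: &dyn Timeline) -> Result<()> {
    let history = timeline.history()?;
    println!("history is complete up to {}", history.lsn());
    for item in history {
        let RelationUpdate { rel, lsn, update } = item?;
        match update {
            Update::Page { blknum, .. } => println!("{} blk {}: page image at {}", rel, blknum, lsn),
            Update::WALRecord { blknum, .. } => println!("{} blk {}: WAL record at {}", rel, blknum, lsn),
            Update::Truncate { n_blocks } => println!("{} truncated to {} blocks at {}", rel, n_blocks, lsn),
            Update::Unlink => println!("{} unlinked at {}", rel, lsn),
        }
    }
    Ok(())
}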
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct RelationUpdate {
|
||||
pub rel: RelTag,
|
||||
pub lsn: Lsn,
|
||||
pub update: Update,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum Update {
|
||||
Page { blknum: u32, img: Bytes },
|
||||
WALRecord { blknum: u32, rec: WALRecord },
|
||||
Truncate { n_blocks: u32 },
|
||||
Unlink,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RepositoryStats {
|
||||
pub num_entries: Lsn,
|
||||
pub num_page_images: Lsn,
|
||||
pub num_wal_records: Lsn,
|
||||
pub num_getpage_requests: Lsn,
|
||||
}
|
||||
|
||||
///
|
||||
/// Relation data file segment id throughout the Postgres cluster.
|
||||
///
|
||||
/// Every data file in Postgres is uniquely identified by 4 numbers:
|
||||
/// - relation id / node (`relnode`)
|
||||
/// - database id (`dbnode`)
|
||||
/// - tablespace id (`spcnode`), in short this is a unique id of a separate
|
||||
/// directory to store data files.
|
||||
/// - forknumber (`forknum`) is used to split different kinds of data of the same relation
|
||||
/// between some set of files (`relnode`, `relnode_fsm`, `relnode_vm`).
|
||||
///
|
||||
/// In native Postgres code `RelFileNode` structure and individual `ForkNumber` value
|
||||
/// are used for the same purpose.
|
||||
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct RelTag {
|
||||
pub forknum: u8,
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
}
|
||||
|
||||
impl RelTag {
|
||||
pub const ZEROED: Self = Self {
|
||||
forknum: 0,
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
};
|
||||
}
|
||||
|
||||
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
|
||||
///
|
||||
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
|
||||
///
|
||||
impl fmt::Display for RelTag {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if let Some(forkname) = forknumber_to_name(self.forknum) {
|
||||
write!(
|
||||
f,
|
||||
"{}/{}/{}_{}",
|
||||
self.spcnode, self.dbnode, self.relnode, forkname
|
||||
)
|
||||
} else {
|
||||
write!(f, "{}/{}/{}", self.spcnode, self.dbnode, self.relnode)
|
||||
}
|
||||
}
|
||||
}
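
// --- Illustrative sketch (not part of the original diff) ---
// Example of the Display format above, using made-up OIDs. A non-main fork
// gets whatever suffix forknumber_to_name() returns (e.g. "_vm" for the
// visibility map); the main fork is assumed to print without a suffix.
#[allow(dead_code)]
fn example_reltag_display() {
    let main_fork = RelTag { spcnode: 1663, dbnode: 13008, relnode: 16384, forknum: 0 };
    let vm_fork = RelTag { forknum: 2, ..main_fork };
    println!("{}", main_fork); // "1663/13008/16384"
    println!("{}", vm_fork);   // e.g. "1663/13008/16384_vm"
}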
|
||||
|
||||
///
|
||||
/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
|
||||
/// This is used as a part of the key inside key-value storage (RocksDB currently).
|
||||
///
|
||||
/// In Postgres `BufferTag` structure is used for exactly the same purpose.
|
||||
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct BufferTag {
|
||||
pub rel: RelTag,
|
||||
pub blknum: u32,
|
||||
}
|
||||
|
||||
impl BufferTag {
|
||||
pub const ZEROED: Self = Self {
|
||||
rel: RelTag::ZEROED,
|
||||
blknum: 0,
|
||||
};
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct WALRecord {
|
||||
pub lsn: Lsn, // LSN at the *end* of the record
|
||||
pub will_init: bool,
|
||||
pub rec: Bytes,
|
||||
// Remember the offset of main_data in rec,
|
||||
// so that we don't have to parse the record again.
|
||||
// If record has no main_data, this offset equals rec.len().
|
||||
pub main_data_offset: u32,
|
||||
}
|
||||
|
||||
impl WALRecord {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
buf.put_u64(self.lsn.0);
|
||||
buf.put_u8(self.will_init as u8);
|
||||
buf.put_u32(self.main_data_offset);
|
||||
buf.put_u32(self.rec.len() as u32);
|
||||
buf.put_slice(&self.rec[..]);
|
||||
}
|
||||
pub fn unpack(buf: &mut Bytes) -> WALRecord {
|
||||
let lsn = Lsn::from(buf.get_u64());
|
||||
let will_init = buf.get_u8() != 0;
|
||||
let main_data_offset = buf.get_u32();
|
||||
let mut dst = vec![0u8; buf.get_u32() as usize];
|
||||
buf.copy_to_slice(&mut dst);
|
||||
WALRecord {
|
||||
lsn,
|
||||
will_init,
|
||||
rec: Bytes::from(dst),
|
||||
main_data_offset,
|
||||
}
|
||||
}
|
||||
}
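
// --- Illustrative sketch (not part of the original diff) ---
// Round-trip example for pack()/unpack() above, with a made-up payload.
// unpack() expects exactly the layout pack() writes: u64 LSN, u8 will_init,
// u32 main_data_offset, u32 length, then the record bytes.
#[allow(dead_code)]
fn example_walrecord_roundtrip() {
    let rec = WALRecord {
        lsn: Lsn(0x1234_5678),
        will_init: true,
        rec: Bytes::from_static(b"made-up WAL record body"),
        main_data_offset: 4,
    };
    let mut buf = BytesMut::new();
    rec.pack(&mut buf);
    let unpacked = WALRecord::unpack(&mut buf.freeze());
    assert_eq!(rec, unpacked);
}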
|
||||
|
||||
///
|
||||
/// Tests that should work the same with any Repository/Timeline implementation.
|
||||
///
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::object_repository::ObjectRepository;
|
||||
use crate::rocksdb_storage::RocksObjectStore;
|
||||
use crate::walredo::{WalRedoError, WalRedoManager};
|
||||
use crate::PageServerConf;
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
const TESTREL_A: RelTag = RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
relnode: 1000,
|
||||
forknum: 0,
|
||||
};
|
||||
const TESTREL_B: RelTag = RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
relnode: 1001,
|
||||
forknum: 0,
|
||||
};
|
||||
|
||||
/// Convenience function to create a BufferTag for testing.
|
||||
/// Helps to keep the tests shorter.
|
||||
#[allow(non_snake_case)]
|
||||
fn TEST_BUF(blknum: u32) -> ObjectTag {
|
||||
ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: TESTREL_A,
|
||||
blknum,
|
||||
})
|
||||
}
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
fn TEST_IMG(s: &str) -> Bytes {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
buf.resize(8192, 0);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
|
||||
let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
|
||||
let conf = PageServerConf {
|
||||
daemonize: false,
|
||||
interactive: false,
|
||||
materialize: false,
|
||||
gc_horizon: 64 * 1024 * 1024,
|
||||
gc_period: Duration::from_secs(10),
|
||||
wal_redoers: 1,
|
||||
listen_addr: "127.0.0.1:5430".to_string(),
|
||||
workdir: repo_dir,
|
||||
pg_distrib_dir: "".into(),
|
||||
};
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let obj_store = RocksObjectStore::create(conf)?;
|
||||
|
||||
let walredo_mgr = TestRedoManager {};
|
||||
|
||||
let repo = ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr));
|
||||
|
||||
Ok(Box::new(repo))
|
||||
}
|
||||
|
||||
/// Test get_relsize() and truncation.
|
||||
#[test]
|
||||
fn test_relsize() -> Result<()> {
|
||||
// get_timeline() with non-existent timeline id should fail
|
||||
//repo.get_timeline("11223344556677881122334455667788");
|
||||
|
||||
// Create timeline to work on
|
||||
let repo = get_test_repo("test_relsize")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
tline.init_valid_lsn(Lsn(1));
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"), true)?;
|
||||
tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"), true)?;
|
||||
tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"), true)?;
|
||||
|
||||
tline.advance_last_valid_lsn(Lsn(5));
|
||||
|
||||
// The relation was created at LSN 2, not visible at LSN 1 yet.
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(1))?, false);
|
||||
assert!(tline.get_rel_size(TESTREL_A, Lsn(1)).is_err());
|
||||
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(2))?, true);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(2))?, 1);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(5))?, 3);
|
||||
|
||||
// Check page contents at each LSN
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(2))?,
|
||||
TEST_IMG("foo blk 0 at 2")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(3))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(1), Lsn(4))?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(5))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(1), Lsn(5))?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
// Truncate last block
|
||||
tline.put_truncation(TESTREL_A, Lsn(6), 2)?;
|
||||
tline.advance_last_valid_lsn(Lsn(6));
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(6))?, 2);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(6))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(1), Lsn(6))?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
|
||||
// should still see the truncated block with older LSN
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(5))?, 3);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Test get_relsize() and truncation with a file larger than 1 GB, so that it's
|
||||
/// split into multiple 1 GB segments in Postgres.
|
||||
///
|
||||
/// This isn't very interesting with the RocksDB implementation, as we don't pay
|
||||
/// any attention to Postgres segment boundaries there.
|
||||
#[test]
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let repo = get_test_repo("test_large_rel")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
tline.init_valid_lsn(Lsn(1));
|
||||
|
||||
let mut lsn = 0;
|
||||
for i in 0..pg_constants::RELSEG_SIZE + 1 {
|
||||
let img = TEST_IMG(&format!("foo blk {} at {}", i, Lsn(lsn)));
|
||||
lsn += 1;
|
||||
tline.put_page_image(TEST_BUF(i as u32), Lsn(lsn), img, true)?;
|
||||
}
|
||||
tline.advance_last_valid_lsn(Lsn(lsn));
|
||||
|
||||
assert_eq!(
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
|
||||
pg_constants::RELSEG_SIZE + 1
|
||||
);
|
||||
|
||||
// Truncate one block
|
||||
lsn += 1;
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
|
||||
tline.advance_last_valid_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
|
||||
pg_constants::RELSEG_SIZE
|
||||
);
|
||||
|
||||
// Truncate another block
|
||||
lsn += 1;
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
|
||||
tline.advance_last_valid_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
|
||||
pg_constants::RELSEG_SIZE - 1
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Test branch creation
|
||||
///
|
||||
#[test]
|
||||
fn test_branch() -> Result<()> {
|
||||
let repo = get_test_repo("test_branch")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
// Create a relation on the timeline
|
||||
tline.init_valid_lsn(Lsn(1));
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"), true)?;
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(4), TEST_IMG("foo blk 0 at 4"), true)?;
|
||||
|
||||
// Create another relation
|
||||
let buftag2 = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: TESTREL_B,
|
||||
blknum: 0,
|
||||
});
|
||||
tline.put_page_image(buftag2, Lsn(2), TEST_IMG("foobar blk 0 at 2"), true)?;
|
||||
|
||||
tline.advance_last_valid_lsn(Lsn(4));
|
||||
|
||||
// Branch the history, modify relation differently on the new timeline
|
||||
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
|
||||
repo.branch_timeline(timelineid, newtimelineid, Lsn(3))?;
|
||||
let newtline = repo.get_timeline(newtimelineid)?;
|
||||
|
||||
newtline.put_page_image(TEST_BUF(0), Lsn(4), TEST_IMG("bar blk 0 at 4"), true)?;
|
||||
newtline.advance_last_valid_lsn(Lsn(4));
|
||||
|
||||
// Check page contents on both branches
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
|
||||
TEST_IMG("foo blk 0 at 4")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
newtline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
|
||||
TEST_IMG("bar blk 0 at 4")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
newtline.get_page_at_lsn(buftag2, Lsn(4))?,
|
||||
TEST_IMG("foobar blk 0 at 2")
|
||||
);
|
||||
|
||||
assert_eq!(newtline.get_rel_size(TESTREL_B, Lsn(4))?, 1);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_history() -> Result<()> {
|
||||
let repo = get_test_repo("test_snapshot")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
let mut snapshot = tline.history()?;
|
||||
assert_eq!(snapshot.lsn(), Lsn(0));
|
||||
assert_eq!(None, snapshot.next().transpose()?);
|
||||
|
||||
// add a page and advance the last valid LSN
|
||||
let rel = TESTREL_A;
|
||||
let tag = TEST_BUF(1);
|
||||
tline.put_page_image(tag, Lsn(1), TEST_IMG("blk 1 @ lsn 1"), true)?;
|
||||
tline.advance_last_valid_lsn(Lsn(1));
|
||||
let mut snapshot = tline.history()?;
|
||||
assert_eq!(snapshot.lsn(), Lsn(1));
|
||||
let expected_page = RelationUpdate {
|
||||
rel: rel,
|
||||
lsn: Lsn(1),
|
||||
update: Update::Page {
|
||||
blknum: 1,
|
||||
img: TEST_IMG("blk 1 @ lsn 1"),
|
||||
},
|
||||
};
|
||||
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
|
||||
assert_eq!(None, snapshot.next().transpose()?);
|
||||
|
||||
// truncate to zero, but don't advance the last valid LSN
|
||||
tline.put_truncation(rel, Lsn(2), 0)?;
|
||||
let mut snapshot = tline.history()?;
|
||||
assert_eq!(snapshot.lsn(), Lsn(1));
|
||||
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
|
||||
assert_eq!(None, snapshot.next().transpose()?);
|
||||
|
||||
// advance the last valid LSN and the truncation should be observable
|
||||
tline.advance_last_valid_lsn(Lsn(2));
|
||||
let mut snapshot = tline.history()?;
|
||||
assert_eq!(snapshot.lsn(), Lsn(2));
|
||||
|
||||
// TODO ordering not guaranteed by API. But currently it returns the
|
||||
// truncation entry before the block data.
|
||||
let expected_truncate = RelationUpdate {
|
||||
rel: rel,
|
||||
lsn: Lsn(2),
|
||||
update: Update::Truncate { n_blocks: 0 },
|
||||
};
|
||||
assert_eq!(Some(expected_truncate), snapshot.next().transpose()?);
|
||||
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
|
||||
assert_eq!(None, snapshot.next().transpose()?);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
struct TestRedoManager {}
|
||||
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
tag: ObjectTag,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for {:?} to get to {}, with {} and {} records",
|
||||
tag,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{}", s);
|
||||
Ok(TEST_IMG(&s))
|
||||
}
|
||||
}
|
||||
}
|
||||
836	pageserver/src/restore_local_repo.rs	Normal file
@@ -0,0 +1,836 @@
|
||||
//!
|
||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||
//! zenith Timeline.
|
||||
//!
|
||||
use log::*;
|
||||
use std::cmp::{max, min};
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::io::Seek;
|
||||
use std::io::SeekFrom;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, Bytes};
|
||||
|
||||
use crate::object_key::*;
|
||||
use crate::repository::*;
|
||||
use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::{pg_constants, CheckPoint, ControlFileData};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
const MAX_MBR_BLKNO: u32 =
|
||||
pg_constants::MAX_MULTIXACT_ID / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
|
||||
///
|
||||
/// Find latest snapshot in a timeline's 'snapshots' directory
|
||||
///
|
||||
pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Result<Lsn> {
|
||||
let snapshotspath = format!("timelines/{}/snapshots", timeline);
|
||||
|
||||
let mut last_snapshot_lsn = Lsn(0);
|
||||
for direntry in fs::read_dir(&snapshotspath).unwrap() {
|
||||
let filename = direntry.unwrap().file_name();
|
||||
|
||||
if let Ok(lsn) = Lsn::from_filename(&filename) {
|
||||
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
|
||||
} else {
|
||||
error!("unrecognized file in snapshots directory: {:?}", filename);
|
||||
}
|
||||
}
|
||||
|
||||
if last_snapshot_lsn == Lsn(0) {
|
||||
error!("could not find valid snapshot in {}", &snapshotspath);
|
||||
// TODO return error?
|
||||
}
|
||||
Ok(last_snapshot_lsn)
|
||||
}
|
||||
|
||||
///
|
||||
/// Import all relation data pages from local disk into the repository.
|
||||
///
|
||||
pub fn import_timeline_from_postgres_datadir(
|
||||
path: &Path,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
// Scan 'global'
|
||||
for direntry in fs::read_dir(path.join("global"))? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
// These special files appear in the snapshot, but are not needed by the page server
|
||||
Some("pg_control") => {
|
||||
import_nonrel_file(timeline, lsn, ObjectTag::ControlFile, &direntry.path())?;
|
||||
// Extract the checkpoint record from pg_control and store it as a separate object
|
||||
let pg_control_bytes =
|
||||
timeline.get_page_at_lsn_nowait(ObjectTag::ControlFile, lsn, false)?;
|
||||
let pg_control = ControlFileData::decode(&pg_control_bytes)?;
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
timeline.put_page_image(ObjectTag::Checkpoint, lsn, checkpoint_bytes, false)?;
|
||||
}
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
timeline,
|
||||
lsn,
|
||||
ObjectTag::FileNodeMap(DatabaseTag {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
}),
|
||||
&direntry.path(),
|
||||
)?,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
timeline,
|
||||
lsn,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
)?,
|
||||
}
|
||||
}
|
||||
|
||||
// Scan 'base'. It contains one directory per database; the directory name is the database OID.
|
||||
// E.g. 'base/12345', where 12345 is the database OID.
|
||||
for direntry in fs::read_dir(path.join("base"))? {
|
||||
let direntry = direntry?;
|
||||
|
||||
let dboid = direntry.file_name().to_str().unwrap().parse::<u32>()?;
|
||||
|
||||
for direntry in fs::read_dir(direntry.path())? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
// These special files appear in the snapshot, but are not needed by the page server
|
||||
Some("PG_VERSION") => continue,
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
timeline,
|
||||
lsn,
|
||||
ObjectTag::FileNodeMap(DatabaseTag {
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dbnode: dboid,
|
||||
}),
|
||||
&direntry.path(),
|
||||
)?,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
timeline,
|
||||
lsn,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
)?,
|
||||
}
|
||||
}
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_xact"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(
|
||||
timeline,
|
||||
lsn,
|
||||
|blknum| ObjectTag::Clog(SlruBufferTag { blknum }),
|
||||
&entry.path(),
|
||||
)?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(
|
||||
timeline,
|
||||
lsn,
|
||||
|blknum| ObjectTag::MultiXactMembers(SlruBufferTag { blknum }),
|
||||
&entry.path(),
|
||||
)?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(
|
||||
timeline,
|
||||
lsn,
|
||||
|blknum| ObjectTag::MultiXactOffsets(SlruBufferTag { blknum }),
|
||||
&entry.path(),
|
||||
)?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_twophase"))? {
|
||||
let entry = entry?;
|
||||
let xid = u32::from_str_radix(&entry.path().to_str().unwrap(), 16)?;
|
||||
import_nonrel_file(
|
||||
timeline,
|
||||
lsn,
|
||||
ObjectTag::TwoPhase(PrepareTag { xid }),
|
||||
&entry.path(),
|
||||
)?;
|
||||
}
|
||||
// TODO: Scan pg_tblspc
|
||||
|
||||
timeline.checkpoint()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
|
||||
fn import_relfile(
|
||||
path: &Path,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
spcoid: Oid,
|
||||
dboid: Oid,
|
||||
) -> Result<()> {
|
||||
// Does it look like a relation file?
|
||||
|
||||
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
|
||||
if let Err(e) = p {
|
||||
warn!("unrecognized file in snapshot: {:?} ({})", path, e);
|
||||
return Err(e.into());
|
||||
}
|
||||
let (relnode, forknum, segno) = p.unwrap();
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
|
||||
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
let tag = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode,
|
||||
forknum,
|
||||
},
|
||||
blknum,
|
||||
});
|
||||
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf), true)?;
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(e) => match e.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
}
|
||||
},
|
||||
};
|
||||
blknum += 1;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn import_nonrel_file(
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
tag: ObjectTag,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let mut file = File::open(path)?;
|
||||
let mut buffer = Vec::new();
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buffer[..]), false)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn import_slru_file(
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
gen_tag: fn(blknum: u32) -> ObjectTag,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
// Read the SLRU segment file page by page and import each page.
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
|
||||
let mut blknum: u32 = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
timeline.put_page_image(
|
||||
gen_tag(blknum),
|
||||
lsn,
|
||||
Bytes::copy_from_slice(&buf),
|
||||
false,
|
||||
)?;
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(e) => match e.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
}
|
||||
},
|
||||
};
|
||||
blknum += 1;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Scan PostgreSQL WAL files in given directory, and load all records >= 'startpoint' into
|
||||
/// the repository.
|
||||
pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint: Lsn) -> Result<()> {
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = startpoint;
|
||||
|
||||
let checkpoint_bytes =
|
||||
timeline.get_page_at_lsn_nowait(ObjectTag::Checkpoint, startpoint, false)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
|
||||
loop {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut path = walpath.join(&filename);
|
||||
|
||||
// It could also be present as a .partial file
|
||||
if !PathBuf::from(&path).exists() {
|
||||
path = walpath.join(filename + ".partial");
|
||||
}
|
||||
|
||||
// Slurp the WAL file
|
||||
let open_result = File::open(&path);
|
||||
if let Err(e) = &open_result {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let mut file = open_result?;
|
||||
|
||||
if offset > 0 {
|
||||
file.seek(SeekFrom::Start(offset as u64))?;
|
||||
}
|
||||
|
||||
let mut buf = Vec::new();
|
||||
let nread = file.read_to_end(&mut buf)?;
|
||||
if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize {
|
||||
// Maybe allow this for .partial files?
|
||||
error!("read only {} bytes from WAL file", nread);
|
||||
}
|
||||
waldecoder.feed_bytes(&buf);
|
||||
|
||||
let mut nrecords = 0;
|
||||
loop {
|
||||
let rec = waldecoder.poll_decode();
|
||||
if rec.is_err() {
|
||||
// Assume that an error means we've reached the end of
|
||||
// a partial WAL record. So that's ok.
|
||||
trace!("WAL decoder error {:?}", rec);
|
||||
break;
|
||||
}
|
||||
if let Some((lsn, recdata)) = rec.unwrap() {
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
save_decoded_record(&mut checkpoint, timeline, &decoded, recdata, lsn)?;
|
||||
last_lsn = lsn;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
nrecords += 1;
|
||||
}
|
||||
|
||||
info!(
|
||||
"imported {} records from WAL file {} up to {}",
|
||||
nrecords,
|
||||
path.display(),
|
||||
last_lsn
|
||||
);
|
||||
|
||||
segno += 1;
|
||||
offset = 0;
|
||||
}
|
||||
info!("reached end of WAL at {}", last_lsn);
|
||||
let checkpoint_bytes = checkpoint.encode();
|
||||
timeline.put_page_image(ObjectTag::Checkpoint, last_lsn, checkpoint_bytes, false)?;
|
||||
Ok(())
|
||||
}
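
// --- Illustrative sketch (not part of the original diff) ---
// Rough outline of how the import helpers in this module fit together when
// seeding a timeline from a local data directory. The snapshot path layout
// and the function name are assumptions, not part of the original code.
#[allow(dead_code)]
fn example_import_flow(
    conf: &PageServerConf,
    timeline: &dyn Timeline,
    timelineid: ZTimelineId,
) -> Result<()> {
    // 1. Pick the newest base snapshot for this timeline.
    let snapshot_lsn = find_latest_snapshot(conf, timelineid)?;
    // 2. Load the data directory contents as of that LSN...
    let datadir = PathBuf::from(format!(
        "timelines/{}/snapshots/{:016X}",
        timelineid, snapshot_lsn.0
    ));
    import_timeline_from_postgres_datadir(&datadir, timeline, snapshot_lsn)?;
    // 3. ...then replay any WAL generated after the snapshot was taken.
    import_timeline_wal(&datadir.join("pg_wal"), timeline, snapshot_lsn)?;
    Ok(())
}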
|
||||
|
||||
///
|
||||
/// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
|
||||
/// relations/pages that the record affects.
|
||||
///
|
||||
pub fn save_decoded_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
decoded: &DecodedWALRecord,
|
||||
recdata: Bytes,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
checkpoint.update_next_xid(decoded.xl_xid);
|
||||
|
||||
// Iterate through all the blocks that the record modifies, and
|
||||
// "put" a separate copy of the record for each block.
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
},
|
||||
blknum: blk.blkno,
|
||||
});
|
||||
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
|
||||
timeline.put_wal_record(tag, rec)?;
|
||||
}
|
||||
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance(decoded.main_data_offset);
|
||||
|
||||
// Handle a few special record types
|
||||
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
|
||||
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE
|
||||
{
|
||||
let truncate = XlSmgrTruncate::decode(&mut buf);
|
||||
save_xlog_smgr_truncate(timeline, lsn, &truncate)?;
|
||||
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
|
||||
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE {
|
||||
let createdb = XlCreateDatabase::decode(&mut buf);
|
||||
save_xlog_dbase_create(timeline, lsn, &createdb)?;
|
||||
} else {
|
||||
// TODO
|
||||
trace!("XLOG_DBASE_DROP is not handled yet");
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID {
|
||||
trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
|
||||
} else if decoded.xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let blknum = buf.get_u32_le();
|
||||
let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
|
||||
if info == pg_constants::CLOG_ZEROPAGE {
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: true,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(tag, rec)?;
|
||||
} else {
|
||||
assert!(info == pg_constants::CLOG_TRUNCATE);
|
||||
checkpoint.oldestXid = buf.get_u32_le();
|
||||
checkpoint.oldestXidDB = buf.get_u32_le();
|
||||
trace!(
|
||||
"RM_CLOG_ID truncate blkno {} oldestXid {} oldestXidDB {}",
|
||||
blknum,
|
||||
checkpoint.oldestXid,
|
||||
checkpoint.oldestXidDB
|
||||
);
|
||||
if let Some(ObjectTag::Clog(first_slru_tag)) =
|
||||
timeline.get_next_tag(ObjectTag::Clog(SlruBufferTag { blknum: 0 }))?
|
||||
{
|
||||
for trunc_blknum in first_slru_tag.blknum..=blknum {
|
||||
let tag = ObjectTag::Clog(SlruBufferTag {
|
||||
blknum: trunc_blknum,
|
||||
});
|
||||
timeline.put_slru_truncate(tag, lsn)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT
|
||||
|| info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|
||||
|| info == pg_constants::XLOG_XACT_ABORT
|
||||
|| info == pg_constants::XLOG_XACT_ABORT_PREPARED
|
||||
{
|
||||
let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
|
||||
save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
|
||||
} else if info == pg_constants::XLOG_XACT_PREPARE {
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: true,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(
|
||||
ObjectTag::TwoPhase(PrepareTag {
|
||||
xid: decoded.xl_xid,
|
||||
}),
|
||||
rec,
|
||||
)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
|
||||
|| info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
|
||||
{
|
||||
let blknum = buf.get_u32_le();
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: true,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
let tag = if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
|
||||
ObjectTag::MultiXactOffsets(SlruBufferTag { blknum })
|
||||
} else {
|
||||
ObjectTag::MultiXactMembers(SlruBufferTag { blknum })
|
||||
};
|
||||
timeline.put_wal_record(tag, rec)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
save_multixact_create_record(checkpoint, timeline, lsn, &xlrec, decoded)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
|
||||
let xlrec = XlMultiXactTruncate::decode(&mut buf);
|
||||
save_multixact_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
|
||||
let xlrec = XlRelmapUpdate::decode(&mut buf);
|
||||
save_relmap_record(timeline, lsn, &xlrec, decoded)?;
|
||||
} else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_NEXTOID {
|
||||
let next_oid = buf.get_u32_le();
|
||||
checkpoint.nextOid = next_oid;
|
||||
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|
||||
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
|
||||
{
|
||||
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance(decoded.main_data_offset);
|
||||
buf.copy_to_slice(&mut checkpoint_bytes);
|
||||
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes).unwrap();
|
||||
trace!(
|
||||
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
|
||||
xlog_checkpoint.oldestXid,
|
||||
checkpoint.oldestXid
|
||||
);
|
||||
if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
|
||||
checkpoint.oldestXid = xlog_checkpoint.oldestXid;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Now that this record has been handled, let the repository know that
|
||||
// it is up-to-date to this LSN
|
||||
timeline.advance_last_record_lsn(lsn);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_DBASE_CREATE record.
|
||||
fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatabase) -> Result<()> {
|
||||
let db_id = rec.db_id;
|
||||
let tablespace_id = rec.tablespace_id;
|
||||
let src_db_id = rec.src_db_id;
|
||||
let src_tablespace_id = rec.src_tablespace_id;
|
||||
|
||||
// Creating a database is implemented by copying the template (aka. source) database.
|
||||
// To copy all the relations, we need to ask for the state as of the same LSN, but we
|
||||
// cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for
|
||||
// the last valid LSN to advance up to it. So we use the previous record's LSN in the
|
||||
// get calls instead.
|
||||
let req_lsn = min(timeline.get_last_record_lsn(), lsn);
|
||||
|
||||
let rels = timeline.list_rels(src_tablespace_id, src_db_id, req_lsn)?;
|
||||
|
||||
trace!("save_create_database: {} rels", rels.len());
|
||||
|
||||
let mut num_rels_copied = 0;
|
||||
let mut num_blocks_copied = 0;
|
||||
for src_rel in rels {
|
||||
assert_eq!(src_rel.spcnode, src_tablespace_id);
|
||||
assert_eq!(src_rel.dbnode, src_db_id);
|
||||
|
||||
let nblocks = timeline.get_rel_size(src_rel, req_lsn)?;
|
||||
let dst_rel = RelTag {
|
||||
spcnode: tablespace_id,
|
||||
dbnode: db_id,
|
||||
relnode: src_rel.relnode,
|
||||
forknum: src_rel.forknum,
|
||||
};
|
||||
|
||||
// Copy content
|
||||
for blknum in 0..nblocks {
|
||||
let src_key = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: src_rel,
|
||||
blknum,
|
||||
});
|
||||
let dst_key = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: dst_rel,
|
||||
blknum,
|
||||
});
|
||||
|
||||
let content = timeline.get_page_at_lsn_nowait(src_key, req_lsn, false)?;
|
||||
|
||||
debug!("copying block {:?} to {:?}", src_key, dst_key);
|
||||
|
||||
timeline.put_page_image(dst_key, lsn, content, true)?;
|
||||
num_blocks_copied += 1;
|
||||
}
|
||||
|
||||
if nblocks == 0 {
|
||||
// make sure we have some trace of the relation, even if it's empty
|
||||
timeline.put_truncation(dst_rel, lsn, 0)?;
|
||||
}
|
||||
|
||||
num_rels_copied += 1;
|
||||
}
|
||||
// Copy relfilemap
|
||||
for tag in timeline.list_nonrels(req_lsn)? {
|
||||
match tag {
|
||||
ObjectTag::FileNodeMap(db) => {
|
||||
if db.spcnode == src_tablespace_id && db.dbnode == src_db_id {
|
||||
let img = timeline.get_page_at_lsn_nowait(tag, req_lsn, false)?;
|
||||
let new_tag = ObjectTag::FileNodeMap(DatabaseTag {
|
||||
spcnode: tablespace_id,
|
||||
dbnode: db_id,
|
||||
});
|
||||
timeline.put_page_image(new_tag, lsn, img, false)?;
|
||||
break;
|
||||
}
|
||||
}
|
||||
_ => {} // do nothing
|
||||
}
|
||||
}
|
||||
info!(
|
||||
"Created database {}/{}, copied {} blocks in {} rels at {}",
|
||||
tablespace_id, db_id, num_blocks_copied, num_rels_copied, lsn
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_SMGR_TRUNCATE record.
|
||||
///
|
||||
/// This is the same logic as in PostgreSQL's smgr_redo() function.
|
||||
fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTruncate) -> Result<()> {
|
||||
let spcnode = rec.rnode.spcnode;
|
||||
let dbnode = rec.rnode.dbnode;
|
||||
let relnode = rec.rnode.relnode;
|
||||
|
||||
if (rec.flags & pg_constants::SMGR_TRUNCATE_HEAP) != 0 {
|
||||
let rel = RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum: pg_constants::MAIN_FORKNUM,
|
||||
};
|
||||
timeline.put_truncation(rel, lsn, rec.blkno)?;
|
||||
}
|
||||
if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 {
|
||||
let rel = RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum: pg_constants::FSM_FORKNUM,
|
||||
};
|
||||
|
||||
// FIXME: 'blkno' stored in the WAL record is the new size of the
|
||||
// heap. The formula for calculating the new size of the FSM is
|
||||
// pretty complicated (see FreeSpaceMapPrepareTruncateRel() in
|
||||
// PostgreSQL), and we should also clear bits in the tail FSM block,
|
||||
// and update the upper level FSM pages. None of that has been
|
||||
// implemented. What we do instead, is always just truncate the FSM
|
||||
// to zero blocks. That's bad for performance, but safe. (The FSM
|
||||
// isn't needed for correctness, so we could also leave garbage in
|
||||
// it. Seems more tidy to zap it away.)
|
||||
if rec.blkno != 0 {
|
||||
info!("Partial truncation of FSM is not supported");
|
||||
}
|
||||
let num_fsm_blocks = 0;
|
||||
timeline.put_truncation(rel, lsn, num_fsm_blocks)?;
|
||||
}
|
||||
if (rec.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 {
|
||||
let rel = RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
|
||||
};
|
||||
|
||||
// FIXME: Like with the FSM above, the logic to truncate the VM
|
||||
// correctly has not been implemented. Just zap it away completely,
|
||||
// always. Unlike the FSM, the VM must never have bits incorrectly
|
||||
// set. From a correctness point of view, it's always OK to clear
|
||||
// bits or remove it altogether, though.
|
||||
if rec.blkno != 0 {
|
||||
info!("Partial truncation of VM is not supported");
|
||||
}
|
||||
let num_vm_blocks = 0;
|
||||
timeline.put_truncation(rel, lsn, num_vm_blocks)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Subroutine of save_decoded_record(), to handle XLOG_XACT_* records.
|
||||
///
|
||||
/// We are currently only interested in the dropped relations.
|
||||
fn save_xact_record(
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
parsed: &XlXactParsedRecord,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
// Record update of CLOG page
|
||||
let mut blknum = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: false,
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(tag, rec.clone())?;
|
||||
|
||||
for subxact in &parsed.subxacts {
|
||||
let subxact_blknum = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
if subxact_blknum != blknum {
|
||||
blknum = subxact_blknum;
|
||||
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
|
||||
timeline.put_wal_record(tag, rec.clone())?;
|
||||
}
|
||||
}
|
||||
for xnode in &parsed.xnodes {
|
||||
for forknum in pg_constants::MAIN_FORKNUM..=pg_constants::VISIBILITYMAP_FORKNUM {
|
||||
let rel_tag = RelTag {
|
||||
forknum,
|
||||
spcnode: xnode.spcnode,
|
||||
dbnode: xnode.dbnode,
|
||||
relnode: xnode.relnode,
|
||||
};
|
||||
timeline.put_unlink(rel_tag, lsn)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn save_multixact_create_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlMultiXactCreate,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: false,
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
let blknum = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
let tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
|
||||
timeline.put_wal_record(tag, rec.clone())?;
|
||||
|
||||
let first_mbr_blkno = xlrec.moff / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
let last_mbr_blkno =
|
||||
(xlrec.moff + xlrec.nmembers - 1) / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
// The members SLRU can, in contrast to the offsets one, be filled to almost
|
||||
// the full range at once. So we need to handle wraparound.
|
||||
let mut blknum = first_mbr_blkno;
|
||||
loop {
|
||||
// Update members page
|
||||
let tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
|
||||
timeline.put_wal_record(tag, rec.clone())?;
|
||||
|
||||
if blknum == last_mbr_blkno {
|
||||
// last block inclusive
|
||||
break;
|
||||
}
|
||||
|
||||
// handle wraparound
|
||||
if blknum == MAX_MBR_BLKNO {
|
||||
blknum = 0;
|
||||
} else {
|
||||
blknum += 1;
|
||||
}
|
||||
}
|
||||
if xlrec.mid >= checkpoint.nextMulti {
|
||||
checkpoint.nextMulti = xlrec.mid + 1;
|
||||
}
|
||||
if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
|
||||
checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
|
||||
}
|
||||
let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
|
||||
if mbr.xid.wrapping_sub(acc) as i32 > 0 {
|
||||
mbr.xid
|
||||
} else {
|
||||
acc
|
||||
}
|
||||
});
|
||||
checkpoint.update_next_xid(max_mbr_xid);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn save_multixact_truncate_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlMultiXactTruncate,
|
||||
) -> Result<()> {
|
||||
checkpoint.oldestMulti = xlrec.end_trunc_off;
|
||||
checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
|
||||
let first_off_blkno = xlrec.start_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
let last_off_blkno = xlrec.end_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
// Delete all the segments except the last one. The last segment can still
|
||||
// contain, possibly partially, valid data.
|
||||
for blknum in first_off_blkno..last_off_blkno {
|
||||
let tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
|
||||
timeline.put_slru_truncate(tag, lsn)?;
|
||||
}
|
||||
let first_mbr_blkno = xlrec.start_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
let last_mbr_blkno = xlrec.end_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
// The members SLRU can, in contrast to the offsets one, be filled to almost
|
||||
// the full range at once. So we need to handle wraparound.
|
||||
let mut blknum = first_mbr_blkno;
|
||||
// Delete all the segments but the last one. The last segment can still
|
||||
// contain, possibly partially, valid data.
|
||||
while blknum != last_mbr_blkno {
|
||||
let tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
|
||||
timeline.put_slru_truncate(tag, lsn)?;
|
||||
// handle wraparound
|
||||
if blknum == MAX_MBR_BLKNO {
|
||||
blknum = 0;
|
||||
} else {
|
||||
blknum += 1;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn save_relmap_record(
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlRelmapUpdate,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: true,
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
let tag = ObjectTag::FileNodeMap(DatabaseTag {
|
||||
spcnode: xlrec.tsid,
|
||||
dbnode: xlrec.dbid,
|
||||
});
|
||||
timeline.put_wal_record(tag, rec)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,323 +0,0 @@
|
||||
//
|
||||
// Restore chunks from S3
|
||||
//
|
||||
// This runs once at Page Server startup. It loads all the "base images" from
|
||||
// S3 into the in-memory page cache. It also initializes the "last valid LSN"
|
||||
// in the page cache to the LSN of the base image, so that when the WAL receiver
|
||||
// is started, it starts streaming from that LSN.
|
||||
//
|
||||
|
||||
use bytes::{Buf, BytesMut};
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::env;
|
||||
use std::fmt;
|
||||
|
||||
use s3::bucket::Bucket;
|
||||
use s3::creds::Credentials;
|
||||
use s3::region::Region;
|
||||
use s3::S3Error;
|
||||
|
||||
use tokio::runtime;
|
||||
|
||||
use futures::future;
|
||||
|
||||
use crate::{page_cache, PageServerConf};
|
||||
|
||||
struct Storage {
|
||||
region: Region,
|
||||
credentials: Credentials,
|
||||
bucket: String,
|
||||
}
|
||||
|
||||
pub fn restore_main(conf: &PageServerConf) {
|
||||
// Create a new thread pool
|
||||
let runtime = runtime::Runtime::new().unwrap();
|
||||
|
||||
runtime.block_on(async {
|
||||
let result = restore_chunk(conf).await;
|
||||
|
||||
match result {
|
||||
Ok(_) => {
|
||||
return;
|
||||
}
|
||||
Err(err) => {
|
||||
error!("S3 error: {}", err);
|
||||
return;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
//
|
||||
// Restores one chunk from S3.
|
||||
//
|
||||
// 1. Fetch the last base image >= given LSN
|
||||
// 2. Fetch all WAL
|
||||
//
|
||||
// Load it all into the page cache.
|
||||
//
|
||||
async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
|
||||
let backend = Storage {
|
||||
region: Region::Custom {
|
||||
region: env::var("S3_REGION").unwrap().into(),
|
||||
endpoint: env::var("S3_ENDPOINT").unwrap().into(),
|
||||
},
|
||||
credentials: Credentials::new(
|
||||
Some(&env::var("S3_ACCESSKEY").unwrap()),
|
||||
Some(&env::var("S3_SECRET").unwrap()),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.unwrap(),
|
||||
bucket: "zenith-testbucket".to_string(),
|
||||
};
|
||||
|
||||
info!("Restoring from S3...");
|
||||
|
||||
// Create Bucket in REGION for BUCKET
|
||||
let bucket = Bucket::new_with_path_style(&backend.bucket, backend.region, backend.credentials)?;
|
||||
|
||||
// List out contents of directory
|
||||
let results: Vec<s3::serde_types::ListBucketResult> = bucket
|
||||
.list("relationdata/".to_string(), Some("".to_string()))
|
||||
.await?;
|
||||
|
||||
// TODO: get that from backup
|
||||
let sys_id: u64 = 42;
|
||||
let mut oldest_lsn = 0;
|
||||
let mut slurp_futures: Vec<_> = Vec::new();
|
||||
|
||||
for result in results {
|
||||
for object in result.contents {
|
||||
// Download every relation file, slurping them into memory
|
||||
|
||||
let key = object.key;
|
||||
let relpath = key.strip_prefix("relationdata/").unwrap();
|
||||
|
||||
let parsed = parse_rel_file_path(&relpath);
|
||||
|
||||
match parsed {
|
||||
Ok(p) => {
|
||||
if oldest_lsn == 0 || p.lsn < oldest_lsn {
|
||||
oldest_lsn = p.lsn;
|
||||
}
|
||||
let b = bucket.clone();
|
||||
let f = slurp_base_file(conf, sys_id, b, key.to_string(), p);
|
||||
|
||||
slurp_futures.push(f);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("unrecognized file: {} ({})", relpath, e);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
if oldest_lsn == 0 {
|
||||
panic!("no base backup found");
|
||||
}
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf.clone(), sys_id);
|
||||
pcache.init_valid_lsn(oldest_lsn);
|
||||
|
||||
info!("{} files to restore...", slurp_futures.len());
|
||||
|
||||
future::join_all(slurp_futures).await;
|
||||
info!("restored!");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
// FIXME: we'll probably need these elsewhere too, move to some common location
|
||||
const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FilePathError {
|
||||
msg: String,
|
||||
}
|
||||
|
||||
impl FilePathError {
|
||||
fn new(msg: &str) -> FilePathError {
|
||||
FilePathError {
|
||||
msg: msg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(e: core::num::ParseIntError) -> Self {
|
||||
return FilePathError {
|
||||
msg: format!("invalid filename: {}", e),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for FilePathError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "invalid filename")
|
||||
}
|
||||
}
|
||||
|
||||
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(0),
|
||||
Some("fsm") => Ok(1),
|
||||
Some("vm") => Ok(2),
|
||||
Some("init") => Ok(3),
|
||||
Some(_) => Err(FilePathError::new("invalid forkname")),
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ParsedBaseImageFileName {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
pub forknum: u32,
|
||||
pub segno: u32,
|
||||
|
||||
pub lsn: u64,
|
||||
}
|
||||
|
||||
// formats:
|
||||
// <oid>
|
||||
// <oid>_<fork name>
|
||||
// <oid>.<segment number>
|
||||
// <oid>_<fork name>.<segment number>
|
||||
|
||||
fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> {
|
||||
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?_(?P<lsnhi>[[:xdigit:]]{8})(?P<lsnlo>[[:xdigit:]]{8})$").unwrap();
|
||||
|
||||
let caps = re
|
||||
.captures(fname)
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode = u32::from_str_radix(relnode_str, 10)?;
|
||||
|
||||
let forkname_match = caps.name("forkname");
|
||||
let forkname = if forkname_match.is_none() {
|
||||
None
|
||||
} else {
|
||||
Some(forkname_match.unwrap().as_str())
|
||||
};
|
||||
let forknum = forkname_to_forknum(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
|
||||
};
|
||||
|
||||
let lsn_hi = u64::from_str_radix(caps.name("lsnhi").unwrap().as_str(), 16)?;
|
||||
let lsn_lo = u64::from_str_radix(caps.name("lsnlo").unwrap().as_str(), 16)?;
|
||||
let lsn = lsn_hi << 32 | lsn_lo;
|
||||
|
||||
return Ok((relnode, forknum, segno, lsn));
|
||||
}
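// Illustrative sketch (hypothetical test, not from the original code): how one
// made-up base image file name is expected to decode, assuming the regex and
// fork-name mapping defined in parse_filename() above.
#[cfg(test)]
mod parse_filename_example {
    use super::*;

    #[test]
    fn decodes_fork_segment_and_lsn() {
        // "1259_fsm.1_000000010000002A": relnode 1259, "fsm" fork, segment 1,
        // and an LSN encoded as 16 hex digits (0x00000001/0000002A).
        let (relnode, forknum, segno, lsn) =
            parse_filename("1259_fsm.1_000000010000002A").unwrap();
        assert_eq!((relnode, forknum, segno), (1259, 1, 1));
        assert_eq!(lsn, 0x1_0000_002A);
    }
}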
|
||||
|
||||
fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
|
||||
/*
|
||||
* Relation data files can be in one of the following directories:
|
||||
*
|
||||
* global/
|
||||
* shared relations
|
||||
*
|
||||
* base/<db oid>/
|
||||
* regular relations, default tablespace
|
||||
*
|
||||
* pg_tblspc/<tblspc oid>/<tblspc version>/
|
||||
* within a non-default tablespace (the name of the directory
|
||||
* depends on version)
|
||||
*
|
||||
* And the relation data files themselves have a filename like:
|
||||
*
|
||||
* <oid>.<segment number>
|
||||
*/
|
||||
if let Some(fname) = path.strip_prefix("global/") {
|
||||
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
relnode,
|
||||
forknum,
|
||||
segno,
|
||||
lsn,
|
||||
});
|
||||
} else if let Some(dbpath) = path.strip_prefix("base/") {
|
||||
let mut s = dbpath.split("/");
|
||||
let dbnode_str = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
let dbnode = u32::from_str_radix(dbnode_str, 10)?;
|
||||
let fname = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
if s.next().is_some() {
|
||||
return Err(FilePathError::new("invalid relation data file name"));
|
||||
};
|
||||
|
||||
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: DEFAULTTABLESPACE_OID,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum,
|
||||
segno,
|
||||
lsn,
|
||||
});
|
||||
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
|
||||
// TODO
|
||||
return Err(FilePathError::new("tablespaces not supported"));
|
||||
} else {
|
||||
return Err(FilePathError::new("invalid relation data file name"));
|
||||
}
|
||||
}
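// Illustrative sketch (hypothetical test, not from the original code): how a
// relative path under base/<db oid>/ is expected to map to a
// ParsedBaseImageFileName. The database OID 13008 is just an example value.
#[cfg(test)]
mod parse_rel_file_path_example {
    use super::*;

    #[test]
    fn parses_default_tablespace_path() {
        let p = parse_rel_file_path("base/13008/1259_000000010000002A").unwrap();
        assert_eq!(p.spcnode, DEFAULTTABLESPACE_OID);
        assert_eq!(p.dbnode, 13008);
        assert_eq!(p.relnode, 1259);
        assert_eq!(p.forknum, 0); // "main" fork is implicit in the file name
        assert_eq!(p.segno, 0);
        assert_eq!(p.lsn, 0x1_0000_002A);
    }
}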
|
||||
|
||||
//
|
||||
// Load a base file from S3, and insert it into the page cache
|
||||
//
|
||||
async fn slurp_base_file(
|
||||
conf: &PageServerConf,
|
||||
sys_id: u64,
|
||||
bucket: Bucket,
|
||||
s3path: String,
|
||||
parsed: ParsedBaseImageFileName,
|
||||
) {
|
||||
// FIXME: rust-s3 opens a new connection for each request. Should reuse
|
||||
// the reqwest::Client object. But that requires changes to rust-s3 itself.
|
||||
let (data, code) = bucket.get_object(s3path.clone()).await.unwrap();
|
||||
|
||||
trace!("got response: {} on {}", code, &s3path);
|
||||
assert_eq!(200, code);
|
||||
|
||||
let mut bytes = BytesMut::from(data.as_slice()).freeze();
|
||||
|
||||
// FIXME: use constants (BLCKSZ)
|
||||
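// A 1 GB relation segment holds 1 GB / 8 KB (BLCKSZ) = 131072 blocks,
// so the first block of segment N is N * 131072.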
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192);
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf.clone(), sys_id);
|
||||
|
||||
while bytes.remaining() >= 8192 {
|
||||
let tag = page_cache::BufferTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
blknum: blknum,
|
||||
};
|
||||
|
||||
pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192));
|
||||
|
||||
blknum += 1;
|
||||
}
|
||||
}
|
||||
pageserver/src/rocksdb_storage.rs (new file, 443 lines)
@@ -0,0 +1,443 @@
|
||||
//!
|
||||
//! An implementation of the ObjectStore interface, backed by RocksDB
|
||||
//!
|
||||
use crate::object_key::*;
|
||||
use crate::object_store::ObjectStore;
|
||||
use crate::repository::RelTag;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::{bail, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
struct StorageKey {
|
||||
obj_key: ObjectKey,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl StorageKey {
|
||||
/// The first key for a given timeline
|
||||
fn timeline_start(timeline: ZTimelineId) -> Self {
|
||||
Self {
|
||||
obj_key: ObjectKey {
|
||||
timeline,
|
||||
tag: ObjectTag::TimelineMetadataTag,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// RocksDB is very inefficient at deleting random records. Instead, we have to use a merge
/// (compaction) filter, which lets us throw away records during the LSM merge phase.
/// Unfortunately, it is hard (if possible at all) to determine at merge time whether a version
/// can be removed. A version can be removed if:
/// 1. It is above the PITR horizon (we need to get the current LSN and gc_horizon from the config)
/// 2. The page is reconstructed at the horizon (all WAL records above the horizon are applied and can be removed)
///
/// So we have a GC process which reconstructs pages at the horizon and marks obsolete WAL records
/// for deletion. To mark an object for deletion we could set some flag in the object itself,
/// but that is complicated with the new object value format, because the RocksDB storage knows nothing
/// about this format. Also, updating a whole record just to set one bit seems inefficient in any case.
/// This is why we keep the keys of versions marked for deletion in an in-memory HashSet.
/// When the LSM merge filter finds a key in this set, it removes the key from the set so that the set does not grow without bound.
|
||||
///
|
||||
struct GarbageCollector {
|
||||
garbage: Mutex<HashSet<Vec<u8>>>,
|
||||
}
|
||||
|
||||
impl GarbageCollector {
|
||||
fn new() -> GarbageCollector {
|
||||
GarbageCollector {
|
||||
garbage: Mutex::new(HashSet::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Called by GC to mark a version as deleted
|
||||
fn mark_for_deletion(&self, key: &[u8]) {
|
||||
let mut garbage = self.garbage.lock().unwrap();
|
||||
garbage.insert(key.to_vec());
|
||||
}
|
||||
|
||||
/// Called by the LSM merge filter. If it finds the key in the set,
/// it drops the record instead of merging it and removes the key from the set.
|
||||
fn was_deleted(&self, key: &[u8]) -> bool {
|
||||
let key = key.to_vec();
|
||||
let mut garbage = self.garbage.lock().unwrap();
|
||||
garbage.remove(&key)
|
||||
}
|
||||
}
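// Illustrative sketch (hypothetical test, not from the original code): the
// intended life cycle of a key in the deletion set, assuming the semantics
// described above. The key bytes are arbitrary.
#[cfg(test)]
mod garbage_collector_example {
    use super::*;

    #[test]
    fn key_is_dropped_exactly_once() {
        let gc = GarbageCollector::new();
        let key = b"some-serialized-storage-key";

        gc.mark_for_deletion(key);
        // The compaction filter sees the key once and discards the record...
        assert!(gc.was_deleted(key));
        // ...after which the entry is gone from the set, bounding memory use.
        assert!(!gc.was_deleted(key));
    }
}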
|
||||
|
||||
pub struct RocksObjectStore {
|
||||
_conf: &'static PageServerConf,
|
||||
|
||||
// RocksDB handle
|
||||
db: rocksdb::DB,
|
||||
gc: Arc<GarbageCollector>,
|
||||
}
|
||||
|
||||
impl ObjectStore for RocksObjectStore {
|
||||
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>> {
|
||||
let val = self.db.get(StorageKey::ser(&StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
})?)?;
|
||||
if let Some(val) = val {
|
||||
Ok(val)
|
||||
} else {
|
||||
bail!("could not find page {:?}", key);
|
||||
}
|
||||
}
|
||||
|
||||
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>> {
|
||||
let mut iter = self.db.raw_iterator();
|
||||
let search_key = StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
iter.seek(search_key.ser()?);
|
||||
if !iter.valid() {
|
||||
Ok(None)
|
||||
} else {
|
||||
let key = StorageKey::des(iter.key().unwrap())?;
|
||||
Ok(Some(key.obj_key.clone()))
|
||||
}
|
||||
}
|
||||
|
||||
fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()> {
|
||||
self.db.put(
|
||||
StorageKey::ser(&StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
})?,
|
||||
value,
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()> {
|
||||
self.gc.mark_for_deletion(&StorageKey::ser(&StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
})?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Iterate through page versions of given page, starting from the given LSN.
|
||||
/// The versions are walked in descending LSN order.
|
||||
fn object_versions<'a>(
|
||||
&'a self,
|
||||
key: &ObjectKey,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>> {
|
||||
let iter = RocksObjectVersionIter::new(&self.db, key, lsn)?;
|
||||
Ok(Box::new(iter))
|
||||
}
|
||||
|
||||
/// Iterate through all timeline objects
|
||||
fn list_objects<'a>(
|
||||
&'a self,
|
||||
timeline: ZTimelineId,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>> {
|
||||
let iter = RocksObjectIter::new(&self.db, timeline, nonrel_only, lsn)?;
|
||||
Ok(Box::new(iter))
|
||||
}
|
||||
|
||||
/// Get a list of all distinct relations in given tablespace and database.
|
||||
///
|
||||
/// TODO: This implementation is very inefficient, it scans
|
||||
/// through all entries in the given database. In practice, this
|
||||
/// is used for CREATE DATABASE, and usually the template database is small.
|
||||
/// But if it's not, this will be slow.
|
||||
fn list_rels(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
spcnode: u32,
|
||||
dbnode: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<HashSet<RelTag>> {
|
||||
// FIXME: This scans everything. Very slow
|
||||
|
||||
let mut rels: HashSet<RelTag> = HashSet::new();
|
||||
|
||||
let mut search_rel_tag = RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode: 0,
|
||||
forknum: 0u8,
|
||||
};
|
||||
let mut iter = self.db.raw_iterator();
|
||||
loop {
|
||||
let search_key = StorageKey {
|
||||
obj_key: ObjectKey {
|
||||
timeline: timelineid,
|
||||
tag: ObjectTag::RelationMetadata(search_rel_tag),
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
iter.seek(search_key.ser()?);
|
||||
if !iter.valid() {
|
||||
break;
|
||||
}
|
||||
let key = StorageKey::des(iter.key().unwrap())?;
|
||||
|
||||
if let ObjectTag::RelationMetadata(rel_tag) = key.obj_key.tag {
|
||||
if spcnode != 0 && rel_tag.spcnode != spcnode
|
||||
|| dbnode != 0 && rel_tag.dbnode != dbnode
|
||||
{
|
||||
break;
|
||||
}
|
||||
if key.lsn <= lsn {
|
||||
// visible in this snapshot
|
||||
rels.insert(rel_tag);
|
||||
}
|
||||
search_rel_tag = rel_tag;
|
||||
// skip to next relation
|
||||
// FIXME: What if relnode is u32::MAX ?
|
||||
search_rel_tag.relnode += 1;
|
||||
} else {
|
||||
// no more relation metadata entries
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
/// Iterate through versions of all objects in a timeline.
|
||||
///
|
||||
/// Returns objects in increasing key-version order.
|
||||
/// Returns all versions up to and including the specified LSN.
|
||||
fn objects<'a>(
|
||||
&'a self,
|
||||
timeline: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>> {
|
||||
let start_key = StorageKey::timeline_start(timeline);
|
||||
let start_key_bytes = StorageKey::ser(&start_key)?;
|
||||
let iter = self.db.iterator(rocksdb::IteratorMode::From(
|
||||
&start_key_bytes,
|
||||
rocksdb::Direction::Forward,
|
||||
));
|
||||
|
||||
Ok(Box::new(RocksObjects {
|
||||
iter,
|
||||
timeline,
|
||||
lsn,
|
||||
}))
|
||||
}
|
||||
|
||||
fn compact(&self) {
|
||||
self.db.compact_range::<&[u8], &[u8]>(None, None);
|
||||
}
|
||||
}
|
||||
|
||||
impl RocksObjectStore {
|
||||
/// Open a RocksDB database.
|
||||
pub fn open(conf: &'static PageServerConf) -> Result<RocksObjectStore> {
|
||||
let opts = Self::get_rocksdb_opts();
|
||||
let obj_store = Self::new(conf, opts)?;
|
||||
Ok(obj_store)
|
||||
}
|
||||
|
||||
/// Create a new, empty RocksDB database.
|
||||
pub fn create(conf: &'static PageServerConf) -> Result<RocksObjectStore> {
|
||||
let path = conf.workdir.join("rocksdb-storage");
|
||||
std::fs::create_dir(&path)?;
|
||||
|
||||
let mut opts = Self::get_rocksdb_opts();
|
||||
opts.create_if_missing(true);
|
||||
opts.set_error_if_exists(true);
|
||||
let obj_store = Self::new(conf, opts)?;
|
||||
Ok(obj_store)
|
||||
}
|
||||
|
||||
fn new(conf: &'static PageServerConf, mut opts: rocksdb::Options) -> Result<RocksObjectStore> {
|
||||
let path = conf.workdir.join("rocksdb-storage");
|
||||
let gc = Arc::new(GarbageCollector::new());
|
||||
let gc_ref = gc.clone();
|
||||
opts.set_compaction_filter("ttl", move |_level: u32, key: &[u8], _val: &[u8]| {
|
||||
if gc_ref.was_deleted(key) {
|
||||
rocksdb::compaction_filter::Decision::Remove
|
||||
} else {
|
||||
rocksdb::compaction_filter::Decision::Keep
|
||||
}
|
||||
});
|
||||
let db = rocksdb::DB::open(&opts, &path)?;
|
||||
let obj_store = RocksObjectStore {
|
||||
_conf: conf,
|
||||
db,
|
||||
gc,
|
||||
};
|
||||
Ok(obj_store)
|
||||
}
|
||||
|
||||
/// common options used by `open` and `create`
|
||||
fn get_rocksdb_opts() -> rocksdb::Options {
|
||||
let mut opts = rocksdb::Options::default();
|
||||
opts.set_use_fsync(true);
|
||||
opts.set_compression_type(rocksdb::DBCompressionType::Lz4);
|
||||
opts
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator for `object_versions`. Returns all page versions of a given block, in
|
||||
/// reverse LSN order.
|
||||
///
|
||||
struct RocksObjectVersionIter<'a> {
|
||||
obj_key: ObjectKey,
|
||||
dbiter: rocksdb::DBRawIterator<'a>,
|
||||
first_call: bool,
|
||||
}
|
||||
impl<'a> RocksObjectVersionIter<'a> {
|
||||
fn new(
|
||||
db: &'a rocksdb::DB,
|
||||
obj_key: &ObjectKey,
|
||||
lsn: Lsn,
|
||||
) -> Result<RocksObjectVersionIter<'a>> {
|
||||
let key = StorageKey {
|
||||
obj_key: obj_key.clone(),
|
||||
lsn,
|
||||
};
|
||||
let mut dbiter = db.raw_iterator();
|
||||
dbiter.seek_for_prev(StorageKey::ser(&key)?); // locate last entry
|
||||
Ok(RocksObjectVersionIter {
|
||||
first_call: true,
|
||||
obj_key: obj_key.clone(),
|
||||
dbiter,
|
||||
})
|
||||
}
|
||||
}
|
||||
impl<'a> Iterator for RocksObjectVersionIter<'a> {
|
||||
type Item = (Lsn, Vec<u8>);
|
||||
|
||||
fn next(&mut self) -> std::option::Option<Self::Item> {
|
||||
if self.first_call {
|
||||
self.first_call = false;
|
||||
} else {
|
||||
self.dbiter.prev(); // walk backwards
|
||||
}
|
||||
|
||||
if !self.dbiter.valid() {
|
||||
return None;
|
||||
}
|
||||
let key = StorageKey::des(self.dbiter.key().unwrap()).unwrap();
|
||||
if key.obj_key.tag != self.obj_key.tag {
|
||||
return None;
|
||||
}
|
||||
let val = self.dbiter.value().unwrap();
|
||||
let result = val.to_vec();
|
||||
|
||||
Some((key.lsn, result))
|
||||
}
|
||||
}
|
||||
|
||||
struct RocksObjects<'r> {
|
||||
iter: rocksdb::DBIterator<'r>,
|
||||
timeline: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl<'r> Iterator for RocksObjects<'r> {
|
||||
// TODO consider returning Box<[u8]>
|
||||
type Item = Result<(ObjectTag, Lsn, Vec<u8>)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.next_result().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'r> RocksObjects<'r> {
|
||||
fn next_result(&mut self) -> Result<Option<(ObjectTag, Lsn, Vec<u8>)>> {
|
||||
for (key_bytes, v) in &mut self.iter {
|
||||
let key = StorageKey::des(&key_bytes)?;
|
||||
|
||||
if key.obj_key.timeline != self.timeline {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if key.lsn > self.lsn {
|
||||
// TODO can speed up by seeking iterator
|
||||
continue;
|
||||
}
|
||||
|
||||
return Ok(Some((key.obj_key.tag, key.lsn, v.to_vec())));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator for `list_objects`. Returns all objects preceding the specified LSN
|
||||
///
|
||||
struct RocksObjectIter<'a> {
|
||||
timeline: ZTimelineId,
|
||||
key: StorageKey,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
dbiter: rocksdb::DBRawIterator<'a>,
|
||||
}
|
||||
impl<'a> RocksObjectIter<'a> {
|
||||
fn new(
|
||||
db: &'a rocksdb::DB,
|
||||
timeline: ZTimelineId,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
) -> Result<RocksObjectIter<'a>> {
|
||||
let key = StorageKey {
|
||||
obj_key: ObjectKey {
|
||||
timeline,
|
||||
tag: ObjectTag::FirstTag,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let dbiter = db.raw_iterator();
|
||||
Ok(RocksObjectIter {
|
||||
key,
|
||||
timeline,
|
||||
nonrel_only,
|
||||
lsn,
|
||||
dbiter,
|
||||
})
|
||||
}
|
||||
}
|
||||
impl<'a> Iterator for RocksObjectIter<'a> {
|
||||
type Item = ObjectTag;
|
||||
|
||||
fn next(&mut self) -> std::option::Option<Self::Item> {
|
||||
loop {
|
||||
self.dbiter.seek(StorageKey::ser(&self.key).unwrap());
|
||||
if !self.dbiter.valid() {
|
||||
return None;
|
||||
}
|
||||
let key = StorageKey::des(self.dbiter.key().unwrap()).unwrap();
|
||||
if key.obj_key.timeline != self.timeline {
|
||||
// End of this timeline
|
||||
return None;
|
||||
}
|
||||
self.key = key.clone();
|
||||
self.key.lsn = Lsn(u64::MAX); // next seek should skip all versions
|
||||
if key.lsn <= self.lsn {
|
||||
// visible in this snapshot
|
||||
if self.nonrel_only {
|
||||
match key.obj_key.tag {
|
||||
ObjectTag::RelationMetadata(_) => return None,
|
||||
ObjectTag::RelationBuffer(_) => return None,
|
||||
_ => return Some(key.obj_key.tag),
|
||||
}
|
||||
} else {
|
||||
return Some(key.obj_key.tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -14,7 +14,6 @@ use tui::text::{Span, Spans, Text};
|
||||
use tui::widgets::{Block, BorderType, Borders, Paragraph, Widget};
|
||||
use tui::Terminal;
|
||||
|
||||
use slog;
|
||||
use slog::Drain;
|
||||
|
||||
lazy_static! {
|
||||
@@ -32,7 +31,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
false
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -42,7 +41,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
false
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -53,7 +52,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
false
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -66,7 +65,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
false
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -85,14 +84,14 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
false
|
||||
})
|
||||
.fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
return slog_scope::set_global_logger(logger);
|
||||
slog_scope::set_global_logger(logger)
|
||||
}
|
||||
|
||||
pub fn ui_main<'b>() -> Result<(), Box<dyn Error>> {
|
||||
pub fn ui_main() -> Result<(), Box<dyn Error>> {
|
||||
// Terminal initialization
|
||||
let stdout = io::stdout().into_raw_mode()?;
|
||||
let stdout = MouseTerminal::from(stdout);
|
||||
@@ -172,6 +171,11 @@ pub fn ui_main<'b>() -> Result<(), Box<dyn Error>> {
|
||||
})?;
|
||||
|
||||
// If the user presses 'q', quit.
|
||||
|
||||
// silence clippy's suggestion to rewrite this as an if-statement. Match
|
||||
// makes more sense as soon as we get another command than 'q'.
|
||||
#[allow(clippy::single_match)]
|
||||
#[allow(clippy::collapsible_match)]
|
||||
if let Event::Input(key) = events.next()? {
|
||||
match key {
|
||||
Key::Char('q') => {
|
||||
@@ -188,6 +192,7 @@ pub fn ui_main<'b>() -> Result<(), Box<dyn Error>> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
struct LogWidget<'a> {
|
||||
logger: &'a TuiLogger,
|
||||
title: &'a str,
|
||||
@@ -229,7 +234,7 @@ impl<'a> Widget for LogWidget<'a> {
|
||||
// Render a widget to show some metrics
|
||||
struct MetricsWidget {}
|
||||
|
||||
fn get_metric_u64<'a>(title: &'a str, value: u64) -> Spans<'a> {
|
||||
fn _get_metric_u64(title: &str, value: u64) -> Spans {
|
||||
Spans::from(vec![
|
||||
Span::styled(format!("{:<20}", title), Style::default()),
|
||||
Span::raw(": "),
|
||||
@@ -240,7 +245,9 @@ fn get_metric_u64<'a>(title: &'a str, value: u64) -> Spans<'a> {
|
||||
])
|
||||
}
|
||||
|
||||
fn get_metric_str<'a>(title: &'a str, value: &'a str) -> Spans<'a> {
|
||||
// This is not used since LSNs were removed from page cache stats.
|
||||
// Maybe it will be used in the future?
|
||||
fn _get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
|
||||
Spans::from(vec![
|
||||
Span::styled(format!("{:<20}", title), Style::default()),
|
||||
Span::raw(": "),
|
||||
@@ -248,13 +255,6 @@ fn get_metric_str<'a>(title: &'a str, value: &'a str) -> Spans<'a> {
|
||||
])
|
||||
}
|
||||
|
||||
// FIXME: We really should define a datatype for LSNs, with Display trait and
|
||||
// helper functions. There's one in tokio-postgres, but I don't think we want
|
||||
// to rely on that.
|
||||
fn format_lsn(lsn: u64) -> String {
|
||||
return format!("{:X}/{:X}", lsn >> 32, lsn & 0xffff_ffff);
|
||||
}
|
||||
|
||||
impl tui::widgets::Widget for MetricsWidget {
|
||||
fn render(self, area: Rect, buf: &mut Buffer) {
|
||||
let block = Block::default()
|
||||
@@ -265,17 +265,24 @@ impl tui::widgets::Widget for MetricsWidget {
|
||||
|
||||
block.render(area, buf);
|
||||
|
||||
#[allow(unused_mut)]
|
||||
let mut lines: Vec<Spans> = Vec::new();
|
||||
|
||||
let page_cache_stats = crate::page_cache::get_stats();
|
||||
// FIXME
|
||||
//let page_cache_stats = crate::page_cache::get_stats();
|
||||
|
||||
// This is not used since LSNs were removed from page cache stats.
|
||||
// Maybe it will be used in the future?
|
||||
/*
|
||||
let lsnrange = format!(
|
||||
"{} - {}",
|
||||
format_lsn(page_cache_stats.first_valid_lsn),
|
||||
format_lsn(page_cache_stats.last_valid_lsn)
|
||||
page_cache_stats.first_valid_lsn, page_cache_stats.last_valid_lsn
|
||||
);
|
||||
let last_valid_recordlsn_str = format_lsn(page_cache_stats.last_record_lsn);
|
||||
let last_valid_recordlsn_str = page_cache_stats.last_record_lsn.to_string();
|
||||
lines.push(get_metric_str("Valid LSN range", &lsnrange));
|
||||
lines.push(get_metric_str("Last record LSN", &last_valid_recordlsn_str));
|
||||
*/
|
||||
/*
|
||||
lines.push(get_metric_u64(
|
||||
"# of cache entries",
|
||||
page_cache_stats.num_entries,
|
||||
@@ -292,7 +299,7 @@ impl tui::widgets::Widget for MetricsWidget {
|
||||
"# of GetPage@LSN calls",
|
||||
page_cache_stats.num_getpage_requests,
|
||||
));
|
||||
|
||||
*/
|
||||
let text = Text::from(lines);
|
||||
|
||||
Paragraph::new(text).render(inner_area, buf);
|
||||
|
||||
@@ -10,7 +10,6 @@ use std::time::Duration;
|
||||
use termion::event::Key;
|
||||
use termion::input::TermRead;
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub enum Event<I> {
|
||||
Input(I),
|
||||
Tick,
|
||||
@@ -55,14 +54,14 @@ impl Events {
|
||||
thread::spawn(move || {
|
||||
let stdin = io::stdin();
|
||||
for evt in stdin.keys() {
|
||||
if let Ok(key) = evt {
|
||||
if let Err(err) = tx.send(Event::Input(key)) {
|
||||
eprintln!("{}", err);
|
||||
return;
|
||||
}
|
||||
if !ignore_exit_key.load(Ordering::Relaxed) && key == config.exit_key {
|
||||
return;
|
||||
}
|
||||
// This will panic if stdin returns EOF.
|
||||
let key = evt.unwrap();
|
||||
if let Err(err) = tx.send(Event::Input(key)) {
|
||||
eprintln!("{}", err);
|
||||
return;
|
||||
}
|
||||
if !ignore_exit_key.load(Ordering::Relaxed) && key == config.exit_key {
|
||||
return;
|
||||
}
|
||||
}
|
||||
})
|
||||
@@ -77,8 +76,8 @@ impl Events {
|
||||
};
|
||||
Events {
|
||||
rx,
|
||||
ignore_exit_key,
|
||||
input_handle,
|
||||
ignore_exit_key,
|
||||
tick_handle,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
//
|
||||
use chrono::offset::Local;
|
||||
use chrono::DateTime;
|
||||
use slog;
|
||||
use slog::{Drain, Level, OwnedKVList, Record};
|
||||
use slog_async::AsyncRecord;
|
||||
use std::collections::VecDeque;
|
||||
@@ -52,7 +51,7 @@ impl Drain for TuiLogger {
|
||||
events.pop_back();
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,7 +80,7 @@ impl<'b> TuiLoggerWidget<'b> {
|
||||
style_trace: None,
|
||||
style_info: None,
|
||||
show_module: true,
|
||||
logger: logger,
|
||||
logger,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -168,7 +167,7 @@ impl<'b> Widget for TuiLoggerWidget<'b> {
|
||||
Level::Debug => (self.style_debug, "DEBUG", true),
|
||||
Level::Trace => (self.style_trace, "TRACE", true),
|
||||
};
|
||||
line.push(Span::styled(txt, lvl_style.unwrap_or(Style::default())));
|
||||
line.push(Span::styled(txt, lvl_style.unwrap_or_default()));
|
||||
|
||||
if self.show_module {
|
||||
line.push(Span::raw(" "));
|
||||
|
||||
File diff suppressed because it is too large
@@ -1,167 +1,217 @@
|
||||
//
|
||||
// WAL receiver
|
||||
//
|
||||
// The WAL receiver connects to the WAL safekeeper service, and streams WAL.
|
||||
// For each WAL record, it decodes the record to figure out which data blocks
|
||||
// the record affects, and adds the records to the page cache.
|
||||
//
|
||||
use log::*;
|
||||
|
||||
use tokio::runtime;
|
||||
use tokio::time::{sleep, Duration};
|
||||
use tokio_stream::StreamExt;
|
||||
//!
|
||||
//! WAL receiver connects to the WAL safekeeper service,
|
||||
//! streams WAL, decodes records and saves them in page cache.
|
||||
//!
|
||||
//! We keep one WAL receiver active per timeline.
|
||||
|
||||
use crate::object_key::*;
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::BufferTag;
|
||||
use crate::waldecoder::WalStreamDecoder;
|
||||
use crate::restore_local_repo;
|
||||
use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::{Error, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use postgres::fallible_iterator::FallibleIterator;
|
||||
use postgres::replication::ReplicationIter;
|
||||
use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use tokio_postgres::{connect_replication, Error, NoTls, ReplicationMode};
|
||||
use postgres_types::PgLsn;
|
||||
use std::cmp::{max, min};
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Mutex;
|
||||
use std::thread;
|
||||
use std::thread::sleep;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
//
|
||||
// We keep one WAL Receiver active per timeline.
|
||||
//
|
||||
struct WalReceiverEntry {
|
||||
wal_producer_connstr: String,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref WAL_RECEIVERS: Mutex<HashMap<ZTimelineId, WalReceiverEntry>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
// Launch a new WAL receiver, or tell one that's running about change in connection string
|
||||
pub fn launch_wal_receiver(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
) {
|
||||
let mut receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
|
||||
match receivers.get_mut(&timelineid) {
|
||||
Some(receiver) => {
|
||||
receiver.wal_producer_connstr = wal_producer_connstr.into();
|
||||
}
|
||||
None => {
|
||||
let receiver = WalReceiverEntry {
|
||||
wal_producer_connstr: wal_producer_connstr.into(),
|
||||
};
|
||||
receivers.insert(timelineid, receiver);
|
||||
|
||||
// Also launch a new thread to handle this connection
|
||||
let _walreceiver_thread = thread::Builder::new()
|
||||
.name("WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
thread_main(conf, timelineid);
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Look up current WAL producer connection string in the hash table
|
||||
fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
|
||||
let receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
|
||||
receivers
|
||||
.get(&timelineid)
|
||||
.unwrap()
|
||||
.wal_producer_connstr
|
||||
.clone()
|
||||
}
|
||||
|
||||
//
|
||||
// This is the entry point for the WAL receiver thread.
|
||||
//
|
||||
pub fn thread_main(conf: PageServerConf, wal_producer_connstr: &String) {
|
||||
info!("WAL receiver thread started: '{}'", wal_producer_connstr);
|
||||
fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId) {
|
||||
info!(
|
||||
"WAL receiver thread started for timeline : '{}'",
|
||||
timelineid
|
||||
);
|
||||
|
||||
let runtime = runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
//
|
||||
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
|
||||
// and start streaming WAL from it. If the connection is lost, keep retrying.
|
||||
//
|
||||
loop {
|
||||
// Look up the current WAL producer address
|
||||
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
|
||||
|
||||
runtime.block_on(async {
|
||||
loop {
|
||||
let _res = walreceiver_main(conf.clone(), wal_producer_connstr).await;
|
||||
let res = walreceiver_main(conf, timelineid, &wal_producer_connstr);
|
||||
|
||||
// TODO: print/log the error
|
||||
if let Err(e) = res {
|
||||
info!(
|
||||
"WAL streaming connection failed, retrying in 1 second...: {:?}",
|
||||
_res
|
||||
"WAL streaming connection failed ({}), retrying in 1 second",
|
||||
e
|
||||
);
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
sleep(Duration::from_secs(1));
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
async fn walreceiver_main(
|
||||
conf: PageServerConf,
|
||||
wal_producer_connstr: &String,
|
||||
fn walreceiver_main(
|
||||
_conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
) -> Result<(), Error> {
|
||||
// Connect to the database in replication mode.
|
||||
debug!("connecting to {}...", wal_producer_connstr);
|
||||
let (mut rclient, connection) = connect_replication(
|
||||
wal_producer_connstr.as_str(),
|
||||
NoTls,
|
||||
ReplicationMode::Physical,
|
||||
)
|
||||
.await?;
|
||||
debug!("connected!");
|
||||
info!("connecting to {:?}", wal_producer_connstr);
|
||||
let connect_cfg = format!(
|
||||
"{} application_name=pageserver replication=true",
|
||||
wal_producer_connstr
|
||||
);
|
||||
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
error!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
let mut rclient = Client::connect(&connect_cfg, NoTls)?;
|
||||
info!("connected!");
|
||||
|
||||
let identify_system = rclient.identify_system().await?;
|
||||
let end_of_wal = u64::from(identify_system.xlogpos());
|
||||
let identify = identify_system(&mut rclient)?;
|
||||
info!("{:?}", identify);
|
||||
let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
|
||||
let mut caught_up = false;
|
||||
|
||||
let sysid: u64 = identify_system.systemid().parse().unwrap();
|
||||
let pcache = page_cache::get_pagecache(conf, sysid);
|
||||
let repository = page_cache::get_repository();
|
||||
let timeline = repository.get_timeline(timelineid).unwrap();
|
||||
|
||||
//
|
||||
// Start streaming the WAL, from where we left off previously.
|
||||
//
|
||||
let mut startpoint = pcache.get_last_valid_lsn();
|
||||
if startpoint == 0 {
|
||||
// If we start here with identify_system.xlogpos() we will have race condition with
|
||||
// postgres start: insert into postgres may request page that was modified with lsn
|
||||
// smaller than identify_system.xlogpos().
|
||||
//
|
||||
// Current procedure for starting postgres will anyway be changed to something
|
||||
// different like having 'initdb' method on a pageserver (or importing some shared
|
||||
// empty database snapshot), so for now I just put start of first segment which
|
||||
// seems to be a valid record.
|
||||
pcache.init_valid_lsn(0x_1_000_000_u64);
|
||||
startpoint = u64::from(0x_1_000_000_u64);
|
||||
} else {
|
||||
// There might be some padding after the last full record, skip it.
|
||||
//
|
||||
// FIXME: It probably would be better to always start streaming from the beginning
|
||||
// of the page, or the segment, so that we could check the page/segment headers
|
||||
// too. Just for the sake of paranoia.
|
||||
if startpoint % 8 != 0 {
|
||||
startpoint += 8 - (startpoint % 8);
|
||||
}
|
||||
}
|
||||
debug!(
|
||||
"starting replication from {:X}/{:X}, server is at {:X}/{:X}...",
|
||||
(startpoint >> 32),
|
||||
(startpoint & 0xffffffff),
|
||||
(end_of_wal >> 32),
|
||||
(end_of_wal & 0xffffffff)
|
||||
);
|
||||
let startpoint = tokio_postgres::types::Lsn::from(startpoint);
|
||||
let mut physical_stream = rclient
|
||||
.start_physical_replication(None, startpoint, None)
|
||||
.await?;
|
||||
let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint));
|
||||
// If we had previously received WAL up to some point in the middle of a WAL record, we
|
||||
// better start from the end of last full WAL record, not in the middle of one. Hence,
|
||||
// use 'last_record_lsn' rather than 'last_valid_lsn' here.
|
||||
let mut last_rec_lsn = timeline.get_last_record_lsn();
|
||||
let mut startpoint = last_rec_lsn;
|
||||
|
||||
while let Some(replication_message) = physical_stream.next().await {
|
||||
match replication_message? {
|
||||
if startpoint == Lsn(0) {
|
||||
error!("No previous WAL position");
|
||||
}
|
||||
|
||||
// There might be some padding after the last full record, skip it.
|
||||
//
|
||||
// FIXME: It probably would be better to always start streaming from the beginning
|
||||
// of the page, or the segment, so that we could check the page/segment headers
|
||||
// too. Just for the sake of paranoia.
|
||||
startpoint += startpoint.calc_padding(8u32);
|
||||
|
||||
debug!(
|
||||
"last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
|
||||
last_rec_lsn, startpoint, timelineid, end_of_wal
|
||||
);
|
||||
|
||||
let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
|
||||
|
||||
let copy_stream = rclient.copy_both_simple(&query)?;
|
||||
let mut physical_stream = ReplicationIter::new(copy_stream);
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
let checkpoint_bytes =
|
||||
timeline.get_page_at_lsn_nowait(ObjectTag::Checkpoint, startpoint, false)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
|
||||
|
||||
while let Some(replication_message) = physical_stream.next()? {
|
||||
let status_update = match replication_message {
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
// Pass the WAL data to the decoder, and see if we can decode
|
||||
// more records as a result.
|
||||
let data = xlog_data.data();
|
||||
let startlsn = xlog_data.wal_start();
|
||||
let startlsn = Lsn::from(xlog_data.wal_start());
|
||||
let endlsn = startlsn + data.len() as u64;
|
||||
let prev_last_rec_lsn = last_rec_lsn;
|
||||
|
||||
trace!(
|
||||
"received XLogData between {:X}/{:X} and {:X}/{:X}",
|
||||
(startlsn >> 32),
|
||||
(startlsn & 0xffffffff),
|
||||
(endlsn >> 32),
|
||||
(endlsn & 0xffffffff)
|
||||
);
|
||||
write_wal_file(startlsn, timelineid, pg_constants::WAL_SEGMENT_SIZE, data)?;
|
||||
|
||||
trace!("received XLogData between {} and {}", startlsn, endlsn);
|
||||
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
loop {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode() {
|
||||
let decoded =
|
||||
crate::waldecoder::decode_wal_record(startlsn, recdata.clone());
|
||||
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
// Save old checkpoint value to compare with it after decoding WAL record
|
||||
let old_checkpoint_bytes = checkpoint.encode();
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
restore_local_repo::save_decoded_record(
|
||||
&mut checkpoint,
|
||||
&*timeline,
|
||||
&decoded,
|
||||
recdata,
|
||||
lsn,
|
||||
)?;
|
||||
last_rec_lsn = lsn;
|
||||
|
||||
// Put the WAL record to the page cache. We make a separate copy of
|
||||
// it for every block it modifies. (The actual WAL record is kept in
|
||||
// a Bytes, which uses a reference counter for the underlying buffer,
|
||||
// so having multiple copies of it doesn't cost that much)
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = BufferTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
blknum: blk.blkno,
|
||||
};
|
||||
|
||||
let rec = page_cache::WALRecord {
|
||||
lsn: lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
};
|
||||
|
||||
pcache.put_wal_record(tag, rec);
|
||||
}
|
||||
|
||||
// Now that this record has been handled, let the page cache know that
|
||||
// it is up-to-date to this LSN
|
||||
pcache.advance_last_valid_lsn(lsn);
|
||||
} else {
|
||||
break;
|
||||
let new_checkpoint_bytes = checkpoint.encode();
|
||||
// Check if checkpoint data was updated by save_decoded_record
|
||||
if new_checkpoint_bytes != old_checkpoint_bytes {
|
||||
timeline.put_page_image(
|
||||
ObjectTag::Checkpoint,
|
||||
lsn,
|
||||
new_checkpoint_bytes,
|
||||
false,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -171,24 +221,256 @@ async fn walreceiver_main(
|
||||
// better reflect that, because GetPage@LSN requests might also point in the
|
||||
// middle of a record, if the request LSN was taken from the server's current
|
||||
// flush ptr.
|
||||
pcache.advance_last_valid_lsn(endlsn);
|
||||
timeline.advance_last_valid_lsn(endlsn);
|
||||
|
||||
// Somewhat arbitrarily, if we have at least 10 complete wal segments (16 MB each),
|
||||
// "checkpoint" the repository to flush all the changes from WAL we've processed
|
||||
// so far to disk. After this, we don't need the original WAL anymore, and it
|
||||
// can be removed. This is probably too aggressive for production, but it's useful
|
||||
// to expose bugs now.
|
||||
//
|
||||
// TODO: We don't actually dare to remove the WAL. It's useful for debugging,
|
||||
// and we might need it for logical decoding or other things in the future. Although
|
||||
// we should also be able to fetch it back from the WAL safekeepers or S3 if
|
||||
// needed.
|
||||
if prev_last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
|
||||
!= last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
|
||||
{
|
||||
info!("switched segment {} to {}", prev_last_rec_lsn, last_rec_lsn);
|
||||
let (oldest_segno, newest_segno) = find_wal_file_range(
|
||||
timelineid,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
last_rec_lsn,
|
||||
)?;
|
||||
|
||||
if newest_segno - oldest_segno >= 10 {
|
||||
timeline.checkpoint()?;
|
||||
|
||||
// TODO: This is where we could remove WAL older than last_rec_lsn.
|
||||
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
|
||||
}
|
||||
}
|
||||
|
||||
if !caught_up && endlsn >= end_of_wal {
|
||||
info!(
|
||||
"caught up at LSN {:X}/{:X}",
|
||||
(endlsn >> 32),
|
||||
(endlsn & 0xffffffff)
|
||||
);
|
||||
info!("caught up at LSN {}", endlsn);
|
||||
caught_up = true;
|
||||
}
|
||||
|
||||
Some(endlsn)
|
||||
}
|
||||
|
||||
ReplicationMessage::PrimaryKeepAlive(keepalive) => {
|
||||
let wal_end = keepalive.wal_end();
|
||||
let timestamp = keepalive.timestamp();
|
||||
let reply_requested = keepalive.reply() != 0;
|
||||
|
||||
trace!(
|
||||
"received PrimaryKeepAlive(wal_end: {}, timestamp: {:?} reply: {})",
|
||||
wal_end,
|
||||
timestamp,
|
||||
reply_requested,
|
||||
);
|
||||
|
||||
if reply_requested {
|
||||
Some(timeline.get_last_valid_lsn())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
ReplicationMessage::PrimaryKeepAlive(_keepalive) => {
|
||||
trace!("received PrimaryKeepAlive");
|
||||
// FIXME: Reply, or the connection will time out
|
||||
}
|
||||
_ => (),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
if let Some(last_lsn) = status_update {
|
||||
// TODO: More thought should go into what values are sent here.
|
||||
let last_lsn = PgLsn::from(u64::from(last_lsn));
|
||||
let write_lsn = last_lsn;
|
||||
let flush_lsn = last_lsn;
|
||||
let apply_lsn = PgLsn::from(0);
|
||||
let ts = SystemTime::now();
|
||||
const NO_REPLY: u8 = 0;
|
||||
|
||||
physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn find_wal_file_range(
|
||||
timeline: ZTimelineId,
|
||||
wal_seg_size: usize,
|
||||
written_upto: Lsn,
|
||||
) -> Result<(u64, u64)> {
|
||||
let written_upto_segno = written_upto.segment_number(wal_seg_size);
|
||||
|
||||
let mut oldest_segno = written_upto_segno;
|
||||
let mut newest_segno = written_upto_segno;
|
||||
// Scan the wal directory, and count how many WAL files we could remove
|
||||
let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));
|
||||
for entry in fs::read_dir(wal_dir)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
|
||||
if path.is_dir() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let filename = path.file_name().unwrap().to_str().unwrap();
|
||||
|
||||
if IsXLogFileName(filename) {
|
||||
let (segno, _tli) = XLogFromFileName(filename, wal_seg_size);
|
||||
|
||||
if segno > written_upto_segno {
|
||||
// that's strange.
|
||||
warn!("there is a WAL file from future at {}", path.display());
|
||||
continue;
|
||||
}
|
||||
|
||||
oldest_segno = min(oldest_segno, segno);
|
||||
newest_segno = max(newest_segno, segno);
|
||||
}
|
||||
}
|
||||
// FIXME: would be good to assert that there are no gaps in the WAL files
|
||||
|
||||
Ok((oldest_segno, newest_segno))
|
||||
}
|
||||
|
||||
/// Data returned from the postgres `IDENTIFY_SYSTEM` command
|
||||
///
|
||||
/// See the [postgres docs] for more details.
|
||||
///
|
||||
/// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html
|
||||
#[derive(Debug)]
|
||||
pub struct IdentifySystem {
|
||||
systemid: u64,
|
||||
timeline: u32,
|
||||
xlogpos: PgLsn,
|
||||
dbname: Option<String>,
|
||||
}
|
||||
|
||||
/// There was a problem parsing the response to
|
||||
/// a postgres IDENTIFY_SYSTEM command.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("IDENTIFY_SYSTEM parse error")]
|
||||
pub struct IdentifyError;
|
||||
|
||||
/// Run the postgres `IDENTIFY_SYSTEM` command
|
||||
pub fn identify_system(client: &mut Client) -> Result<IdentifySystem, Error> {
|
||||
let query_str = "IDENTIFY_SYSTEM";
|
||||
let response = client.simple_query(query_str)?;
|
||||
|
||||
// get(N) from row, then parse it as some destination type.
|
||||
fn get_parse<T>(row: &SimpleQueryRow, idx: usize) -> Result<T, IdentifyError>
|
||||
where
|
||||
T: FromStr,
|
||||
{
|
||||
let val = row.get(idx).ok_or(IdentifyError)?;
|
||||
val.parse::<T>().or(Err(IdentifyError))
|
||||
}
|
||||
|
||||
// extract the row contents into an IdentifySystem struct.
|
||||
// written as a closure so I can use ? for Option here.
|
||||
if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) {
|
||||
Ok(IdentifySystem {
|
||||
systemid: get_parse(first_row, 0)?,
|
||||
timeline: get_parse(first_row, 1)?,
|
||||
xlogpos: get_parse(first_row, 2)?,
|
||||
dbname: get_parse(first_row, 3).ok(),
|
||||
})
|
||||
} else {
|
||||
Err(IdentifyError.into())
|
||||
}
|
||||
}
|
||||
|
||||
fn write_wal_file(
|
||||
startpos: Lsn,
|
||||
timeline: ZTimelineId,
|
||||
wal_seg_size: usize,
|
||||
buf: &[u8],
|
||||
) -> anyhow::Result<()> {
|
||||
let mut bytes_left: usize = buf.len();
|
||||
let mut bytes_written: usize = 0;
|
||||
let mut partial;
|
||||
let mut start_pos = startpos;
|
||||
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
||||
|
||||
let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));
|
||||
|
||||
/* Extract WAL location for this block */
|
||||
let mut xlogoff = start_pos.segment_offset(wal_seg_size);
|
||||
|
||||
while bytes_left != 0 {
|
||||
let bytes_to_write;
|
||||
|
||||
/*
|
||||
* If crossing a WAL boundary, only write up until we reach wal
|
||||
* segment size.
|
||||
*/
|
||||
if xlogoff + bytes_left > wal_seg_size {
|
||||
bytes_to_write = wal_seg_size - xlogoff;
|
||||
} else {
|
||||
bytes_to_write = bytes_left;
|
||||
}
|
||||
|
||||
/* Open file */
|
||||
let segno = start_pos.segment_number(wal_seg_size);
|
||||
let wal_file_name = XLogFileName(
|
||||
1, // FIXME: always use Postgres timeline 1
|
||||
segno,
|
||||
wal_seg_size,
|
||||
);
|
||||
let wal_file_path = wal_dir.join(wal_file_name.clone());
|
||||
let wal_file_partial_path = wal_dir.join(wal_file_name.clone() + ".partial");
|
||||
|
||||
{
|
||||
let mut wal_file: File;
|
||||
/* Try to open already completed segment */
|
||||
if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) {
|
||||
wal_file = file;
|
||||
partial = false;
|
||||
} else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) {
|
||||
/* Try to open an existing partial file */
|
||||
wal_file = file;
|
||||
partial = true;
|
||||
} else {
|
||||
/* Create and fill new partial file */
|
||||
partial = true;
|
||||
match OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
.open(&wal_file_partial_path)
|
||||
{
|
||||
Ok(mut file) => {
|
||||
for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
|
||||
file.write_all(&ZERO_BLOCK)?;
|
||||
}
|
||||
wal_file = file;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to open log file {:?}: {}", &wal_file_path, e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
wal_file.seek(SeekFrom::Start(xlogoff as u64))?;
|
||||
wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?;
|
||||
|
||||
// FIXME: Flush the file
|
||||
//wal_file.sync_all()?;
|
||||
}
|
||||
/* Write was successful, advance our position */
|
||||
bytes_written += bytes_to_write;
|
||||
bytes_left -= bytes_to_write;
|
||||
start_pos += bytes_to_write as u64;
|
||||
xlogoff += bytes_to_write;
|
||||
|
||||
/* Did we reach the end of a WAL segment? */
|
||||
if start_pos.segment_offset(wal_seg_size) == 0 {
|
||||
xlogoff = 0;
|
||||
if partial {
|
||||
fs::rename(&wal_file_partial_path, &wal_file_path)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,183 +1,545 @@
|
||||
//
|
||||
// WAL redo
|
||||
//
|
||||
// We rely on Postgres to perform WAL redo for us. We launch a
|
||||
// postgres process in special "wal redo" mode that's similar to
|
||||
// single-user mode. We then pass the the previous page image, if any,
|
||||
// and all the WAL records we want to apply, to the postgress
|
||||
// process. Then we get the page image back. Communication with the
|
||||
// postgres process happens via stdin/stdout
|
||||
//
|
||||
// See src/backend/tcop/zenith_wal_redo.c for the other side of
|
||||
// this communication.
|
||||
//
|
||||
// TODO: Even though the postgres code runs in a separate process,
|
||||
// it's not a secure sandbox.
|
||||
//
|
||||
//!
|
||||
//! WAL redo. This service runs PostgreSQL in a special wal_redo mode
|
||||
//! to apply given WAL records over an old page image and return new page image.
|
||||
//!
|
||||
//! We rely on Postgres to perform WAL redo for us. We launch a
|
||||
//! postgres process in special "wal redo" mode that's similar to
|
||||
//! single-user mode. We then pass the previous page image, if any,
|
||||
//! and all the WAL records we want to apply, to the postgres
|
||||
//! process. Then we get the page image back. Communication with the
|
||||
//! postgres process happens via stdin/stdout
|
||||
//!
|
||||
//! See src/backend/tcop/zenith_wal_redo.c for the other side of
|
||||
//! this communication.
|
||||
//!
|
||||
//! TODO: Even though the postgres code runs in a separate process,
|
||||
//! it's not a secure sandbox.
|
||||
//!
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use std::assert;
|
||||
use std::cell::RefCell;
|
||||
use std::fs;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::prelude::*;
|
||||
use std::io::Error;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Stdio;
|
||||
use std::sync::mpsc;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use std::{path::PathBuf, process::Stdio};
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tokio::process::{Child, ChildStdin, ChildStdout, Command};
|
||||
use tokio::runtime::Runtime;
|
||||
use tokio::process::{ChildStdin, ChildStdout, Command};
|
||||
use tokio::time::timeout;
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use crate::object_key::*;
|
||||
use crate::repository::BufferTag;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::waldecoder::XlXactParsedRecord;
|
||||
use crate::waldecoder::{MultiXactId, XlMultiXactCreate};
|
||||
use crate::PageServerConf;
|
||||
use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::XLogRecord;
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::CacheEntry;
|
||||
use crate::page_cache::WALRecord;
|
||||
use crate::{page_cache::BufferTag, PageServerConf};
|
||||
///
|
||||
/// WAL Redo Manager is responsible for replaying WAL records.
|
||||
///
|
||||
/// Callers use the WAL redo manager through this abstract interface,
|
||||
/// which makes it easy to mock it in tests.
|
||||
pub trait WalRedoManager: Send + Sync {
|
||||
/// Apply some WAL records.
|
||||
///
|
||||
/// The caller passes an old page image, and WAL records that should be
|
||||
/// applied over it. The return value is a new page image, after applying
|
||||
/// the records.
|
||||
fn request_redo(
|
||||
&self,
|
||||
tag: ObjectTag,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError>;
|
||||
}
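// Illustrative sketch (not from the original code): how a caller is expected to
// drive this interface, assuming it already has the tag, LSN, base image and
// WAL records collected from the repository. The helper name is hypothetical.
#[allow(dead_code)]
fn example_reconstruct_page(
    mgr: &dyn WalRedoManager,
    tag: ObjectTag,
    lsn: Lsn,
    base_img: Option<Bytes>,
    records: Vec<WALRecord>,
) -> Result<Bytes, WalRedoError> {
    // The manager replays `records` on top of `base_img` (or on a freshly
    // initialized page when the first record initializes it) and returns the
    // materialized page image at `lsn`.
    mgr.request_redo(tag, lsn, base_img, records)
}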
|
||||
|
||||
///
|
||||
/// A dummy WAL Redo Manager implementation that doesn't allow replaying
|
||||
/// anything. Currently used during bootstrapping (zenith init), to create
|
||||
/// a Repository object without launching the real WAL redo process.
|
||||
///
|
||||
pub struct DummyRedoManager {}
|
||||
impl crate::walredo::WalRedoManager for DummyRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
_tag: ObjectTag,
|
||||
_lsn: Lsn,
|
||||
_base_img: Option<Bytes>,
|
||||
_records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
Err(WalRedoError::InvalidState)
|
||||
}
|
||||
}
|
||||
|
||||
static TIMEOUT: Duration = Duration::from_secs(20);
|
||||
|
||||
//
|
||||
// Main entry point for the WAL applicator thread.
|
||||
//
|
||||
pub fn wal_redo_main(conf: PageServerConf, sys_id: u64) {
|
||||
info!("WAL redo thread started {}", sys_id);
|
||||
///
|
||||
/// The implementation consists of two parts: PostgresRedoManager, and
|
||||
/// PostgresRedoManagerInternal. PostgresRedoManager is the public struct
|
||||
/// that can be used to send redo requests to the manager.
|
||||
/// PostgresRedoManagerInternal is used by the manager thread itself.
|
||||
///
|
||||
pub struct PostgresRedoManager {
|
||||
request_tx: Mutex<mpsc::Sender<WalRedoRequest>>,
|
||||
}
|
||||
|
||||
        // We block on waiting for requests on the walredo request channel, but
        // use async I/O to communicate with the child process. Initialize the
        // runtime for the async part.
        let runtime = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .unwrap();
struct PostgresRedoManagerInternal {
    conf: &'static PageServerConf,

        let pcache = page_cache::get_pagecache(conf.clone(), sys_id);
    request_rx: mpsc::Receiver<WalRedoRequest>,
}

        // Loop forever, handling requests as they come.
        let walredo_channel_receiver = &pcache.walredo_receiver;
        loop {
            let mut process: WalRedoProcess;
            let datadir = conf.data_dir.join(format!("wal-redo/{}", sys_id));
#[derive(Debug)]
struct WalRedoRequestData {
    tag: ObjectTag,
    lsn: Lsn,
    base_img: Option<Bytes>,
    records: Vec<WALRecord>,
}

            info!("launching WAL redo postgres process {}", sys_id);
            {
                let _guard = runtime.enter();
                process = WalRedoProcess::launch(&datadir, &runtime).unwrap();
#[derive(Debug)]
struct WalRedoRequest {
    data: WalRedoRequestData,
    response_channel: mpsc::Sender<Result<Bytes, WalRedoError>>,
}

/// An error happened in WAL redo
#[derive(Debug, thiserror::Error)]
pub enum WalRedoError {
    #[error(transparent)]
    IoError(#[from] std::io::Error),

    #[error("cannot perform WAL redo now")]
    InvalidState,
}

///
/// Public interface of WAL redo manager
///
impl PostgresRedoManager {
    ///
    /// Create a new PostgresRedoManager.
    ///
    /// This launches a new thread to handle the requests.
    pub fn new(conf: &'static PageServerConf) -> PostgresRedoManager {
        let (tx, rx) = mpsc::channel();

        //
        // Launch the WAL redo thread
        //
        // Get mutable references to the values that we need to pass to the
        // thread.
        let request_rx = rx;

        // Currently, the join handle is not saved anywhere and we
        // won't try restart the thread if it dies.
        let _walredo_thread = std::thread::Builder::new()
            .name("WAL redo thread".into())
            .spawn(move || {
                let mut internal = PostgresRedoManagerInternal { conf, request_rx };
                internal.wal_redo_main();
            })
            .unwrap();

        PostgresRedoManager {
            request_tx: Mutex::new(tx),
        }
    }
}

        // Pretty arbitrarily, reuse the same Postgres process for 100 requests.
        // After that, kill it and start a new one. This is mostly to avoid
        // using up all shared buffers in Postgres's shared buffer cache; we don't
        // want to write any pages to disk in the WAL redo process.
        for _i in 1..100 {
            let request = walredo_channel_receiver.recv().unwrap();
impl WalRedoManager for PostgresRedoManager {
    ///
    /// Request the WAL redo manager to apply some WAL records
    ///
    /// The WAL redo is handled by a separate thread, so this just sends a request
    /// to the thread and waits for response.
    ///
    fn request_redo(
        &self,
        tag: ObjectTag,
        lsn: Lsn,
        base_img: Option<Bytes>,
        records: Vec<WALRecord>,
    ) -> Result<Bytes, WalRedoError> {
        // Create a channel where to receive the response
        let (tx, rx) = mpsc::channel::<Result<Bytes, WalRedoError>>();

            let result = handle_apply_request(&pcache, &process, &runtime, request);
            if result.is_err() {
                // On error, kill the process.
                break;
        let request = WalRedoRequest {
            data: WalRedoRequestData {
                tag,
                lsn,
                base_img,
                records,
            },
            response_channel: tx,
        };

        self.request_tx
            .lock()
            .unwrap()
            .send(request)
            .expect("could not send WAL redo request");

        rx.recv()
            .expect("could not receive response to WAL redo request")
    }
}
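For context, here is a minimal sketch of the caller side of this interface, assuming a `manager: &dyn WalRedoManager` and page-reconstruction state the caller has already gathered; the helper name and the page-size assertion are illustrative and not code from this diff:

```rust
// A sketch of how a caller would drive request_redo(); the blocking happens
// on the per-request response channel inside the manager.
fn reconstruct_page(
    manager: &dyn WalRedoManager,
    tag: ObjectTag,
    lsn: Lsn,
    base_img: Option<Bytes>,
    records: Vec<WALRecord>,
) -> Result<Bytes, WalRedoError> {
    // Sends the request over the mpsc channel to the WAL redo thread and
    // waits for the reconstructed page image to come back.
    let page_img = manager.request_redo(tag, lsn, base_img, records)?;
    assert_eq!(page_img.len(), 8192, "redo should return a full 8 kB page");
    Ok(page_img)
}
```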

fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
    ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
        % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE
        * pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize
}

fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
    (xid as u16) % pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP
        * pg_constants::MXACT_MEMBER_BITS_PER_XACT
}

/* Location (byte offset within page) of TransactionId of given member */
fn mx_offset_to_member_offset(xid: MultiXactId) -> usize {
    mx_offset_to_flags_offset(xid)
        + (pg_constants::MULTIXACT_FLAGBYTES_PER_GROUP
            + (xid as u16 % pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP) * 4) as usize
}
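A small worked example of the offset arithmetic above, using the multixact constants defined in postgres_ffi/src/pg_constants.rs later in this diff (4 members per 20-byte group, 409 groups per 8 kB page). The test is a sketch that would live in this module, not part of the diff:

```rust
// Worked example of the member-offset math: member index 5 is the second
// member of group 1, so its flags group starts at byte 20, its status bits
// are shifted by 8, and its TransactionId sits at byte 28.
#[test]
fn multixact_member_offsets_example() {
    let member_index: MultiXactId = 5;
    // 5 / 4 = group 1, and group 1 starts at 1 * 20 = 20.
    assert_eq!(mx_offset_to_flags_offset(member_index), 20);
    // 5 % 4 = 1, so the flags for this member are shifted by 1 * 8 = 8 bits.
    assert_eq!(mx_offset_to_flags_bitshift(member_index), 8);
    // The xid follows the 4 flag bytes: 20 + 4 + 1 * 4 = 28.
    assert_eq!(mx_offset_to_member_offset(member_index), 28);
}
```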

///
/// WAL redo thread
///
impl PostgresRedoManagerInternal {
    //
    // Main entry point for the WAL applicator thread.
    //
    fn wal_redo_main(&mut self) {
        info!("WAL redo thread started");

        // We block on waiting for requests on the walredo request channel, but
        // use async I/O to communicate with the child process. Initialize the
        // runtime for the async part.
        let runtime = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .unwrap();

        let processes: Vec<PostgresRedoProcess>;

        info!("launching WAL redo postgres process");

        let wal_redoers = self.conf.wal_redoers;
        processes = (0..wal_redoers)
            .map(|i| {
                runtime
                    .block_on(PostgresRedoProcess::launch(self.conf, i))
                    .unwrap()
            })
            .collect();

        // Loop forever, handling requests as they come.
        loop {
            let mut requests: Vec<WalRedoRequest> = Vec::new();
            requests.push(
                self.request_rx
                    .recv()
                    .expect("WAL redo request channel was closed"),
            );
            loop {
                let req = self.request_rx.try_recv();
                match req {
                    Ok(req) => requests.push(req),
                    Err(_) => break,
                }
            }
            let request_data = requests.iter().map(|req| &req.data);
            let mut rr = 0; // round robin
            let results = runtime.block_on(async {
                let futures = request_data.map(|req| {
                    rr += 1;
                    self.handle_apply_request(&processes[rr % wal_redoers], &req)
                });
                let mut results: Vec<Result<Bytes, WalRedoError>> = Vec::new();
                for future in futures {
                    results.push(future.await);
                }
                results
            });
            for (result, request) in results.into_iter().zip(requests.iter()) {
                let result_ok = result.is_ok();

                // Send the result to the requester
                let _ = request.response_channel.send(result);

                if !result_ok {
                    error!("wal-redo-postgres failed to apply request {:?}", request);
                }
            }
        }
    }
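The batching logic above (block for the first request, then drain whatever else is already queued before dispatching the batch round-robin over the redo processes) can be shown in isolation. This is a sketch of the drain pattern with a plain std mpsc channel, not a helper that exists in the pageserver:

```rust
use std::sync::mpsc::{Receiver, TryRecvError};

// Block for the first request, then opportunistically collect everything
// else that is already waiting so one pass over the redo processes can
// serve several requests.
fn drain_batch<T>(rx: &Receiver<T>) -> Vec<T> {
    let mut batch = Vec::new();
    // Block until at least one request arrives; a closed channel yields an empty batch.
    match rx.recv() {
        Ok(first) => batch.push(first),
        Err(_) => return batch,
    }
    // Grab the rest without blocking.
    loop {
        match rx.try_recv() {
            Ok(req) => batch.push(req),
            Err(TryRecvError::Empty) | Err(TryRecvError::Disconnected) => break,
        }
    }
    batch
}
```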
|
||||
|
||||
info!("killing WAL redo postgres process");
|
||||
let _ = runtime.block_on(process.stdin.get_mut().shutdown());
|
||||
let mut child = process.child;
|
||||
drop(process.stdin);
|
||||
let _ = runtime.block_on(child.wait());
|
||||
async fn handle_apply_request(
|
||||
&self,
|
||||
process: &PostgresRedoProcess,
|
||||
request: &WalRedoRequestData,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let tag = request.tag;
|
||||
let lsn = request.lsn;
|
||||
let base_img = request.base_img.clone();
|
||||
let records = &request.records;
|
||||
|
||||
let nrecords = records.len();
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
let apply_result: Result<Bytes, Error>;
|
||||
if let ObjectTag::RelationBuffer(buf_tag) = tag {
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
|
||||
} else {
|
||||
// Non-relational WAL records we apply ourselves.
|
||||
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
let mut page = BytesMut::new();
|
||||
if let Some(fpi) = base_img {
|
||||
// If full-page image is provided, then use it...
|
||||
page.extend_from_slice(&fpi[..]);
|
||||
} else {
|
||||
// otherwise initialize page with zeros
|
||||
page.extend_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
// Apply all collected WAL records
|
||||
for record in records {
|
||||
let mut buf = record.rec.clone();
|
||||
|
||||
// 1. Parse XLogRecord struct
|
||||
// FIXME: refactor to avoid code duplication.
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
|
||||
//move to main data
|
||||
// TODO probably, we should store some records in our special format
|
||||
// to avoid this weird parsing on replay
|
||||
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
|
||||
if buf.remaining() > skip {
|
||||
buf.advance(skip);
|
||||
}
|
||||
|
||||
if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::CLOG_ZEROPAGE {
|
||||
// The only operation we need to implement is CLOG_ZEROPAGE
|
||||
page.copy_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
// Transaction manager stuff
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
let tag_blknum = match tag {
|
||||
ObjectTag::Clog(slru) => slru.blknum,
|
||||
ObjectTag::TwoPhase(_) => {
|
||||
assert!(info == pg_constants::XLOG_XACT_PREPARE);
|
||||
trace!("Apply prepare {} record", xlogrec.xl_xid);
|
||||
page.clear();
|
||||
page.extend_from_slice(&buf[..]);
|
||||
continue;
|
||||
}
|
||||
_ => panic!("Not valid XACT object tag {:?}", tag),
|
||||
};
|
||||
let parsed_xact =
|
||||
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
|
||||
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|
||||
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|
||||
{
|
||||
transaction_id_set_status(
|
||||
parsed_xact.xid,
|
||||
pg_constants::TRANSACTION_STATUS_COMMITTED,
|
||||
&mut page,
|
||||
);
|
||||
for subxact in &parsed_xact.subxacts {
|
||||
let blkno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
// only update xids on the requested page
|
||||
if tag_blknum == blkno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_SUB_COMMITTED,
|
||||
&mut page,
|
||||
);
|
||||
}
|
||||
}
|
||||
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|
||||
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
|
||||
{
|
||||
transaction_id_set_status(
|
||||
parsed_xact.xid,
|
||||
pg_constants::TRANSACTION_STATUS_ABORTED,
|
||||
&mut page,
|
||||
);
|
||||
for subxact in &parsed_xact.subxacts {
|
||||
let blkno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
// only update xids on the requested page
|
||||
if tag_blknum == blkno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_ABORTED,
|
||||
&mut page,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
|
||||
// Multixact operations
|
||||
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
|
||||
|| info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
|
||||
{
|
||||
// Just need to zero page
|
||||
page.copy_from_slice(&ZERO_PAGE);
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
if let ObjectTag::MultiXactMembers(slru) = tag {
|
||||
for i in 0..xlrec.nmembers {
|
||||
let blkno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
if blkno == slru.blknum {
|
||||
// update only target block
|
||||
let offset = xlrec.moff + i;
|
||||
let memberoff = mx_offset_to_member_offset(offset);
|
||||
let flagsoff = mx_offset_to_flags_offset(offset);
|
||||
let bshift = mx_offset_to_flags_bitshift(offset);
|
||||
let mut flagsval =
|
||||
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
|
||||
flagsval &=
|
||||
!(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1)
|
||||
<< bshift);
|
||||
flagsval |= xlrec.members[i as usize].status << bshift;
|
||||
LittleEndian::write_u32(
|
||||
&mut page[flagsoff..flagsoff + 4],
|
||||
flagsval,
|
||||
);
|
||||
LittleEndian::write_u32(
|
||||
&mut page[memberoff..memberoff + 4],
|
||||
xlrec.members[i as usize].xid,
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Multixact offsets SLRU
|
||||
let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
|
||||
* 4) as usize;
|
||||
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
|
||||
}
|
||||
} else {
|
||||
panic!();
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_RELMAP_ID {
|
||||
// Relation map file has size 512 bytes
|
||||
page.clear();
|
||||
page.extend_from_slice(&buf[12..]); // skip xl_relmap_update
|
||||
assert!(page.len() == 512); // size of pg_filenode.map
|
||||
}
|
||||
}
|
||||
|
||||
apply_result = Ok::<Bytes, Error>(page.freeze());
|
||||
}
|
||||
|
||||
let duration = start.elapsed();
|
||||
|
||||
let result: Result<Bytes, WalRedoError>;
|
||||
|
||||
debug!(
|
||||
"applied {} WAL records in {} ms to reconstruct page image at LSN {}",
|
||||
nrecords,
|
||||
duration.as_millis(),
|
||||
lsn
|
||||
);
|
||||
|
||||
if let Err(e) = apply_result {
|
||||
error!("could not apply WAL records: {}", e);
|
||||
result = Err(WalRedoError::IoError(e));
|
||||
} else {
|
||||
let img = apply_result.unwrap();
|
||||
|
||||
result = Ok(img);
|
||||
}
|
||||
|
||||
// The caller is responsible for sending the response
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_apply_request(
|
||||
pcache: &page_cache::PageCache,
|
||||
process: &WalRedoProcess,
|
||||
runtime: &Runtime,
|
||||
entry_rc: Arc<CacheEntry>,
|
||||
) -> Result<(), Error> {
|
||||
let tag = entry_rc.key.tag;
|
||||
let lsn = entry_rc.key.lsn;
|
||||
let (base_img, records) = pcache.collect_records_for_apply(entry_rc.as_ref());
|
||||
|
||||
let mut entry = entry_rc.content.lock().unwrap();
|
||||
entry.apply_pending = false;
|
||||
|
||||
let nrecords = records.len();
|
||||
|
||||
let start = Instant::now();
|
||||
let apply_result = process.apply_wal_records(runtime, tag, base_img, records);
|
||||
let duration = start.elapsed();
|
||||
|
||||
let result;
|
||||
|
||||
debug!(
|
||||
"applied {} WAL records in {} ms to reconstruct page image at LSN {:X}/{:X}",
|
||||
nrecords,
|
||||
duration.as_millis(),
|
||||
lsn >> 32,
|
||||
lsn & 0xffff_ffff
|
||||
);
|
||||
|
||||
if let Err(e) = apply_result {
|
||||
error!("could not apply WAL records: {}", e);
|
||||
result = Err(e);
|
||||
} else {
|
||||
entry.page_image = Some(apply_result.unwrap());
|
||||
pcache
|
||||
.num_page_images
|
||||
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
result = Ok(());
|
||||
}
|
||||
|
||||
// Wake up the requester, whether the operation succeeded or not.
|
||||
entry_rc.walredo_condvar.notify_all();
|
||||
|
||||
return result;
|
||||
struct PostgresRedoProcess {
|
||||
stdin: Arc<RefCell<ChildStdin>>,
|
||||
stdout: Arc<RefCell<ChildStdout>>,
|
||||
}
|
||||
|
||||
struct WalRedoProcess {
|
||||
child: Child,
|
||||
stdin: RefCell<ChildStdin>,
|
||||
stdout: RefCell<ChildStdout>,
|
||||
}
|
||||
|
||||
impl WalRedoProcess {
|
||||
impl PostgresRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
// Tests that run the pageserver binary set PG_BIN_DIR and PG_LIB_DIR
// so that WalRedo starts the right postgres. We may later switch to
// setting the same things in the pageserver config file.
|
||||
fn launch(datadir: &PathBuf, runtime: &Runtime) -> Result<WalRedoProcess, Error> {
|
||||
// Create empty data directory for wal-redo postgres deleting old one.
|
||||
fs::remove_dir_all(datadir.to_str().unwrap()).ok();
|
||||
let initdb = runtime
|
||||
.block_on(
|
||||
Command::new("initdb")
|
||||
.args(&["-D", datadir.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.output(),
|
||||
)
|
||||
async fn launch(conf: &PageServerConf, id: usize) -> Result<PostgresRedoProcess, Error> {
|
||||
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
|
||||
// just create one with constant name. That fails if you try to launch more than
|
||||
// one WAL redo manager concurrently.
|
||||
let datadir = conf.workdir.join(format!("wal-redo-datadir-{}", id));
|
||||
|
||||
// Create empty data directory for wal-redo postgres, deleting old one first.
|
||||
if datadir.exists() {
|
||||
info!("directory {:?} exists, removing", &datadir);
|
||||
if let Err(e) = fs::remove_dir_all(&datadir) {
|
||||
error!("could not remove old wal-redo-datadir: {:?}", e);
|
||||
}
|
||||
}
|
||||
info!("running initdb in {:?}", datadir.display());
|
||||
let initdb = Command::new(conf.pg_bin_dir().join("initdb"))
|
||||
.args(&["-D", datadir.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.output()
|
||||
.await
|
||||
.expect("failed to execute initdb");
|
||||
|
||||
if !initdb.status.success() {
|
||||
panic!("initdb failed: {}\nstderr:\n{}",
|
||||
std::str::from_utf8(&initdb.stdout).unwrap(),
|
||||
std::str::from_utf8(&initdb.stderr).unwrap());
|
||||
panic!(
|
||||
"initdb failed: {}\nstderr:\n{}",
|
||||
std::str::from_utf8(&initdb.stdout).unwrap(),
|
||||
std::str::from_utf8(&initdb.stderr).unwrap()
|
||||
);
|
||||
} else {
|
||||
// Limit shared cache for wal-redo-postres
|
||||
let mut config = OpenOptions::new()
|
||||
.append(true)
|
||||
.open(PathBuf::from(&datadir).join("postgresql.conf"))?;
|
||||
config.write_all(b"shared_buffers=128kB\n")?;
|
||||
config.write_all(b"fsync=off\n")?;
|
||||
config.write_all(b"shared_preload_libraries=zenith\n")?;
|
||||
config.write_all(b"zenith.wal_redo=on\n")?;
|
||||
}
|
||||
|
||||
// Start postgres itself
|
||||
let mut child = Command::new("postgres")
|
||||
let mut child = Command::new(conf.pg_bin_dir().join("postgres"))
|
||||
.arg("--wal-redo")
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env("PGDATA", datadir.to_str().unwrap())
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("PGDATA", &datadir)
|
||||
.spawn()
|
||||
.expect("postgres --wal-redo command failed to start");
|
||||
|
||||
info!(
|
||||
"launched WAL redo postgres process on {}",
|
||||
datadir.to_str().unwrap()
|
||||
"launched WAL redo postgres process on {:?}",
|
||||
datadir.display()
|
||||
);
|
||||
|
||||
let stdin = child.stdin.take().expect("failed to open child's stdin");
|
||||
@@ -198,17 +560,16 @@ impl WalRedoProcess {
|
||||
if res.unwrap() == 0 {
|
||||
break;
|
||||
}
|
||||
debug!("wal-redo-postgres: {}", line.trim());
|
||||
error!("wal-redo-postgres: {}", line.trim());
|
||||
line.clear();
|
||||
}
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
tokio::spawn(f_stderr);
|
||||
|
||||
Ok(WalRedoProcess {
|
||||
child: child,
|
||||
stdin: RefCell::new(stdin),
|
||||
stdout: RefCell::new(stdout),
|
||||
Ok(PostgresRedoProcess {
|
||||
stdin: Arc::new(RefCell::new(stdin)),
|
||||
stdout: Arc::new(RefCell::new(stdout)),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -216,146 +577,155 @@ impl WalRedoProcess {
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
fn apply_wal_records(
|
||||
async fn apply_wal_records(
|
||||
&self,
|
||||
runtime: &Runtime,
|
||||
tag: BufferTag,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, Error> {
|
||||
records: &[WALRecord],
|
||||
) -> Result<Bytes, std::io::Error> {
|
||||
let mut stdin = self.stdin.borrow_mut();
|
||||
let mut stdout = self.stdout.borrow_mut();
|
||||
return runtime.block_on(async {
|
||||
//
|
||||
// This async block sends all the commands to the process.
|
||||
//
|
||||
// For reasons I don't understand, this needs to be a "move" block;
|
||||
// otherwise the stdin pipe doesn't get closed, despite the shutdown()
|
||||
// call.
|
||||
//
|
||||
let f_stdin = async {
|
||||
// Send base image, if any. (If the record initializes the page, previous page
|
||||
// version is not needed.)
|
||||
|
||||
// We do three things simultaneously: send the old base image and WAL records to
|
||||
// the child process's stdin, read the result from child's stdout, and forward any logging
|
||||
// information that the child writes to its stderr to the page server's log.
|
||||
//
|
||||
// 'f_stdin' handles writing the base image and WAL records to the child process.
|
||||
// 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
|
||||
// tokio runtime in the 'launch' function already, forwards the logging.
|
||||
let f_stdin = async {
|
||||
// Send base image, if any. (If the record initializes the page, previous page
|
||||
// version is not needed.)
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
|
||||
)
|
||||
.await??;
|
||||
if base_img.is_some() {
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
|
||||
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
|
||||
)
|
||||
.await??;
|
||||
if base_img.is_some() {
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
|
||||
)
|
||||
.await??;
|
||||
}
|
||||
}
|
||||
|
||||
// Send WAL records.
|
||||
for rec in records.iter() {
|
||||
let r = rec.clone();
|
||||
// Send WAL records.
|
||||
for rec in records.iter() {
|
||||
let r = rec.clone();
|
||||
|
||||
stdin
|
||||
.write_all(&build_apply_record_msg(r.lsn, r.rec))
|
||||
.await?;
|
||||
stdin
|
||||
.write_all(&build_apply_record_msg(r.lsn, r.rec))
|
||||
.await?;
|
||||
|
||||
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
|
||||
// r.lsn >> 32, r.lsn & 0xffff_ffff);
|
||||
}
|
||||
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
|
||||
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
|
||||
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
|
||||
// r.lsn >> 32, r.lsn & 0xffff_ffff);
|
||||
}
|
||||
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
|
||||
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
|
||||
|
||||
// Send GetPage command to get the result back
|
||||
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
|
||||
timeout(TIMEOUT, stdin.flush()).await??;
|
||||
//debug!("sent GetPage for {}", tag.blknum);
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
// Send GetPage command to get the result back
|
||||
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
|
||||
timeout(TIMEOUT, stdin.flush()).await??;
|
||||
//debug!("sent GetPage for {}", tag.blknum);
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
|
||||
// Read back new page image
|
||||
let f_stdout = async {
|
||||
let mut buf = [0u8; 8192];
|
||||
// Read back new page image
|
||||
let f_stdout = async {
|
||||
let mut buf = [0u8; 8192];
|
||||
|
||||
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
|
||||
//debug!("got response for {}", tag.blknum);
|
||||
Ok::<[u8; 8192], Error>(buf)
|
||||
};
|
||||
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
|
||||
//debug!("got response for {}", tag.blknum);
|
||||
Ok::<[u8; 8192], Error>(buf)
|
||||
};
|
||||
|
||||
// Kill the process. This closes its stdin, which should signal the process
|
||||
// to terminate. TODO: SIGKILL if needed
|
||||
//child.wait();
|
||||
let res = tokio::try_join!(f_stdout, f_stdin)?;
|
||||
|
||||
let res = futures::try_join!(f_stdout, f_stdin)?;
|
||||
let buf = res.0;
|
||||
|
||||
let buf = res.0;
|
||||
|
||||
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
|
||||
});
|
||||
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
|
||||
}
|
||||
}
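The write/read structure used above can be shown as a standalone sketch: a writer future and a reader future joined with `tokio::try_join!`, each I/O step bounded by a timeout so a wedged child process cannot hang the redo thread. The `IO_TIMEOUT` constant and the `roundtrip` function are assumptions for illustration; the real code uses the `TIMEOUT` defined elsewhere in this file:

```rust
use std::time::Duration;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::process::{ChildStdin, ChildStdout};
use tokio::time::timeout;

const IO_TIMEOUT: Duration = Duration::from_secs(20); // assumption for the sketch

// Write the request to the child's stdin and read one 8 kB page back from
// its stdout concurrently, so a child that stops reading cannot deadlock us.
async fn roundtrip(
    stdin: &mut ChildStdin,
    stdout: &mut ChildStdout,
    request: &[u8],
) -> Result<[u8; 8192], std::io::Error> {
    let write_side = async {
        timeout(IO_TIMEOUT, stdin.write_all(request)).await??;
        timeout(IO_TIMEOUT, stdin.flush()).await??;
        Ok::<(), std::io::Error>(())
    };
    let read_side = async {
        let mut page = [0u8; 8192];
        timeout(IO_TIMEOUT, stdout.read_exact(&mut page)).await??;
        Ok::<[u8; 8192], std::io::Error>(page)
    };
    // try_join! fails fast if either side errors or times out.
    let (page, ()) = tokio::try_join!(read_side, write_side)?;
    Ok(page)
}
```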
|
||||
|
||||
// Functions for constructing messages to send to the postgres WAL redo
|
||||
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
|
||||
// explanation of the protocol.
|
||||
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
|
||||
let len = 4 + 5 * 4;
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8('B' as u8);
|
||||
buf.put_u8(b'B');
|
||||
buf.put_u32(len as u32);
|
||||
buf.put_u32(tag.spcnode);
|
||||
buf.put_u32(tag.dbnode);
|
||||
buf.put_u32(tag.relnode);
|
||||
buf.put_u32(tag.forknum as u32);
|
||||
buf.put_u32(tag.blknum);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
// FIXME: this is a temporary hack that should go away when we refactor
|
||||
// the postgres protocol serialization + handlers.
|
||||
//
|
||||
// BytesMut is a dynamic growable buffer, used a lot in tokio code but
|
||||
// not in the std library. To write to a BytesMut from a serde serializer,
|
||||
// we need to either:
|
||||
// - pre-allocate the required buffer space. This is annoying because we
|
||||
// shouldn't care what the exact serialized size is-- that's the
|
||||
// serializer's job.
|
||||
// - Or, we need to create a temporary "writer" (which implements the
|
||||
// `Write` trait). It's a bit awkward, because the writer consumes the
|
||||
// underlying BytesMut, and we need to extract it later with
|
||||
// `into_inner`.
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let buf = writer.into_inner();
|
||||
|
||||
return buf.freeze();
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
|
||||
assert!(base_img.len() == 8192);
|
||||
|
||||
let len = 4 + 5 * 4 + base_img.len();
|
||||
let len = 4 + 1 + 4 * 4 + base_img.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8('P' as u8);
|
||||
buf.put_u8(b'P');
|
||||
buf.put_u32(len as u32);
|
||||
buf.put_u32(tag.spcnode);
|
||||
buf.put_u32(tag.dbnode);
|
||||
buf.put_u32(tag.relnode);
|
||||
buf.put_u32(tag.forknum as u32);
|
||||
buf.put_u32(tag.blknum);
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let mut buf = writer.into_inner();
|
||||
buf.put(base_img);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
return buf.freeze();
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn build_apply_record_msg(endlsn: u64, rec: Bytes) -> Bytes {
|
||||
fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
|
||||
let len = 4 + 8 + rec.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8('A' as u8);
|
||||
buf.put_u8(b'A');
|
||||
buf.put_u32(len as u32);
|
||||
buf.put_u64(endlsn);
|
||||
buf.put_u64(endlsn.0);
|
||||
buf.put(rec);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
return buf.freeze();
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn build_get_page_msg(tag: BufferTag) -> Bytes {
|
||||
let len = 4 + 5 * 4;
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8('G' as u8);
|
||||
buf.put_u8(b'G');
|
||||
buf.put_u32(len as u32);
|
||||
buf.put_u32(tag.spcnode);
|
||||
buf.put_u32(tag.dbnode);
|
||||
buf.put_u32(tag.relnode);
|
||||
buf.put_u32(tag.forknum as u32);
|
||||
buf.put_u32(tag.blknum);
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let buf = writer.into_inner();
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
return buf.freeze();
|
||||
buf.freeze()
|
||||
}
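Taken together, the four builders above form one redo conversation per page: begin ('B'), optionally push the base image ('P'), apply each record ('A'), then request the resulting page ('G'). A sketch of how they compose, assuming the newer `Lsn`-based `build_apply_record_msg` signature; this helper itself is not part of the diff:

```rust
// Concatenate one full redo conversation; the result is what gets written
// to the wal-redo postgres process's stdin before reading the page back.
fn build_redo_conversation(
    tag: BufferTag,
    base_img: Option<Bytes>,
    records: &[WALRecord],
) -> Vec<u8> {
    let mut msgs = Vec::new();
    msgs.extend_from_slice(&build_begin_redo_for_block_msg(tag));
    if let Some(img) = base_img {
        // Only needed if the first record does not initialize the page.
        msgs.extend_from_slice(&build_push_page_msg(tag, img));
    }
    for rec in records {
        msgs.extend_from_slice(&build_apply_record_msg(rec.lsn, rec.rec.clone()));
    }
    msgs.extend_from_slice(&build_get_page_msg(tag));
    msgs
}
```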
|
||||
|
||||
pgbuild.sh (29 lines deleted)
@@ -1,29 +0,0 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Purpose of this script is to build and install postgres in a local directory
|
||||
# so that zenith integration tests would find pg binaries and support files.
|
||||
#
|
||||
# ./pgbuild.sh would do following:
|
||||
#
|
||||
# 1) run out-of-source build of postgres in REPO_ROOT/tmp_install/build directory (I'm reusing
|
||||
# tmp_install path here since it is already present in .gitignore)
|
||||
#
|
||||
# 2) installs postgres to REPO_ROOT/tmp_install/
|
||||
#
|
||||
REPO_ROOT=$(dirname "$0")
|
||||
REPO_ROOT="`( cd \"$REPO_ROOT\" && pwd )`"
|
||||
|
||||
# configure
|
||||
echo "Configuring postgres build"
|
||||
mkdir -p $REPO_ROOT/tmp_install/build
|
||||
cd $REPO_ROOT/tmp_install/build
|
||||
../../vendor/postgres/configure CFLAGS='-O0' --enable-debug --enable-cassert \
|
||||
--enable-depend --with-libxml --prefix=/ > configure.log
|
||||
|
||||
# compile
|
||||
echo "Compiling postgres"
|
||||
make -j8 -s
|
||||
export DESTDIR=$REPO_ROOT/tmp_install
|
||||
|
||||
echo "Installing postgres to $DESTDIR"
|
||||
make install -s
|
||||
postgres_ffi/Cargo.toml (new file, 25 lines)
@@ -0,0 +1,25 @@
|
||||
[package]
|
||||
name = "postgres_ffi"
|
||||
version = "0.1.0"
|
||||
authors = ["Heikki Linnakangas <heikki@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
hex = "0.4.3"
|
||||
lazy_static = "1.4"
|
||||
log = "0.4.14"
|
||||
memoffset = "0.6.2"
|
||||
thiserror = "1.0"
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
|
||||
[build-dependencies]
|
||||
bindgen = "0.57"
|
||||
postgres_ffi/README (new file, 25 lines)
@@ -0,0 +1,25 @@
|
||||
This module contains utilities for working with PostgreSQL file
|
||||
formats. It's a collection of structs that are auto-generated from the
|
||||
PostgreSQL header files using bindgen, and Rust functions to read and
|
||||
manipulate them.
|
||||
|
||||
There are also a bunch of constants in `pg_constants.rs` that are copied
|
||||
from various PostgreSQL headers, rather than auto-generated. They mostly
|
||||
should be auto-generated too, but that's a TODO.
|
||||
|
||||
The PostgreSQL on-disk file format is not portable across different
|
||||
CPU architectures and operating systems. It is also subject to change
|
||||
in each major PostgreSQL version. Currently, this module is based on
|
||||
PostgreSQL v14, but in the future we will probably need a separate
|
||||
copy for each PostgreSQL version.
|
||||
|
||||
To interact with the C structs, there is some unsafe code in this
|
||||
module. Do not copy-paste that to the rest of the codebase! Keep the
|
||||
amount of unsafe code to a minimum, and limited to this module only,
|
||||
and only where it's truly needed.
|
||||
|
||||
TODO: Currently, there is also some code that deals with WAL records
|
||||
in pageserver/src/waldecoder.rs. That should be moved into this
|
||||
module. The rest of the codebase should not have intimate knowledge of
|
||||
PostgreSQL file formats or WAL layout, that knowledge should be
|
||||
encapsulated in this module.
|
||||
postgres_ffi/build.rs (new file, 57 lines)
@@ -0,0 +1,57 @@
|
||||
extern crate bindgen;
|
||||
|
||||
use std::env;
|
||||
use std::path::PathBuf;
|
||||
|
||||
fn main() {
|
||||
// Tell cargo to invalidate the built crate whenever the wrapper changes
|
||||
println!("cargo:rerun-if-changed=pg_control_ffi.h");
|
||||
|
||||
// The bindgen::Builder is the main entry point
|
||||
// to bindgen, and lets you build up options for
|
||||
// the resulting bindings.
|
||||
let bindings = bindgen::Builder::default()
|
||||
//
|
||||
// All the needed PostgreSQL headers are included from 'pg_control_ffi.h'
|
||||
//
|
||||
.header("pg_control_ffi.h")
|
||||
//
|
||||
// Tell cargo to invalidate the built crate whenever any of the
|
||||
// included header files changed.
|
||||
//
|
||||
.parse_callbacks(Box::new(bindgen::CargoCallbacks))
|
||||
//
|
||||
// These are the types and constants that we want to generate bindings for
|
||||
//
|
||||
.whitelist_type("ControlFileData")
|
||||
.whitelist_type("CheckPoint")
|
||||
.whitelist_type("FullTransactionId")
|
||||
.whitelist_type("XLogRecord")
|
||||
.whitelist_type("XLogPageHeaderData")
|
||||
.whitelist_type("XLogLongPageHeaderData")
|
||||
.whitelist_var("XLOG_PAGE_MAGIC")
|
||||
.whitelist_var("PG_CONTROL_FILE_SIZE")
|
||||
.whitelist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
|
||||
.whitelist_type("DBState")
|
||||
//
|
||||
// Path to the server include dir. It is in tmp_install/include/server, if you did
|
||||
// "configure --prefix=<path to tmp_install>". But if you used "configure --prefix=/",
|
||||
// and used DESTDIR to move it into tmp_install, then it's in
|
||||
// tmp_install/include/postgres/server
|
||||
// 'pg_config --includedir-server' would perhaps be the more proper way to find it,
|
||||
// but this will do for now.
|
||||
//
|
||||
.clang_arg("-I../tmp_install/include/server")
|
||||
.clang_arg("-I../tmp_install/include/postgresql/server")
|
||||
//
|
||||
// Finish the builder and generate the bindings.
|
||||
//
|
||||
.generate()
|
||||
.expect("Unable to generate bindings");
|
||||
|
||||
// Write the bindings to the $OUT_DIR/bindings.rs file.
|
||||
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
|
||||
bindings
|
||||
.write_to_file(out_path.join("bindings.rs"))
|
||||
.expect("Couldn't write bindings!");
|
||||
}
|
||||
postgres_ffi/pg_control_ffi.h (new file, 10 lines)
@@ -0,0 +1,10 @@
|
||||
/*
|
||||
* This header file is the input to bindgen. It includes all the
|
||||
* PostgreSQL headers that we need to auto-generate Rust structs
|
||||
* from. If you need to expose a new struct to Rust code, add the
|
||||
* header here, and whitelist the struct in the build.rs file.
|
||||
*/
|
||||
#include "c.h"
|
||||
#include "catalog/pg_control.h"
|
||||
#include "access/xlog_internal.h"
|
||||
|
||||
postgres_ffi/src/controlfile_utils.rs (new file, 124 lines)
@@ -0,0 +1,124 @@
|
||||
//!
|
||||
//! Utilities for reading and writing the PostgreSQL control file.
|
||||
//!
|
||||
//! The PostgreSQL control file is one the first things that the PostgreSQL
|
||||
//! server reads when it starts up. It indicates whether the server was shut
|
||||
//! down cleanly, or if it crashed or was restored from online backup so that
|
||||
//! WAL recovery needs to be performed. It also contains a copy of the latest
|
||||
//! checkpoint record and its location in the WAL.
|
||||
//!
|
||||
//! The control file also contains fields for detecting whether the
|
||||
//! data directory is compatible with a postgres binary. That includes
|
||||
//! a version number, configuration options that can be set at
|
||||
//! compilation time like the block size, and the platform's alignment
|
||||
//! and endianness information. (The PostgreSQL on-disk file format is
|
||||
//! not portable across platforms.)
|
||||
//!
|
||||
//! The control file is stored in the PostgreSQL data directory, as
|
||||
//! `global/pg_control`. The data stored in it is designed to be smaller than
|
||||
//! 512 bytes, on the assumption that it can be updated atomically. The actual
|
||||
//! file is larger, 8192 bytes, but the rest of it is just filled with zeros.
|
||||
//!
|
||||
//! See src/include/catalog/pg_control.h in the PostgreSQL sources for more
|
||||
//! information. You can use PostgreSQL's pg_controldata utility to view its
|
||||
//! contents.
|
||||
//!
|
||||
use crate::{ControlFileData, PG_CONTROL_FILE_SIZE};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
|
||||
/// Equivalent to sizeof(ControlFileData) in C
|
||||
const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();
|
||||
|
||||
impl ControlFileData {
|
||||
/// Compute the offset of the `crc` field within the `ControlFileData` struct.
|
||||
/// Equivalent to offsetof(ControlFileData, crc) in C.
|
||||
// Someday this can be const when the right compiler features land.
|
||||
fn pg_control_crc_offset() -> usize {
|
||||
memoffset::offset_of!(ControlFileData, crc)
|
||||
}
|
||||
|
||||
///
|
||||
/// Interpret a slice of bytes as a Postgres control file.
|
||||
///
|
||||
pub fn decode(buf: &[u8]) -> Result<ControlFileData> {
|
||||
// Check that the slice has the expected size. The control file is
|
||||
// padded with zeros up to a 512 byte sector size, so accept a
|
||||
// larger size too, so that the caller can just pass the whole file
|
||||
// contents without knowing the exact size of the struct.
|
||||
if buf.len() < SIZEOF_CONTROLDATA {
|
||||
bail!("control file is too short");
|
||||
}
|
||||
|
||||
// Compute the expected CRC of the content.
|
||||
let OFFSETOF_CRC = Self::pg_control_crc_offset();
|
||||
let expectedcrc = crc32c::crc32c(&buf[0..OFFSETOF_CRC]);
|
||||
|
||||
// Convert the slice into an array of the right size, and use `transmute` to
|
||||
// reinterpret the raw bytes as a ControlFileData struct.
|
||||
//
|
||||
// NB: Ideally we would use 'zerocopy::FromBytes' for this, but bindgen doesn't
|
||||
// derive FromBytes for us. The safety of this depends on the same constraints
|
||||
// as for FromBytes, namely, all of its fields must implement FromBytes. That
|
||||
// includes the primitive integer types, like `u8`, `u16`, `u32`, `u64` and their
|
||||
// signed variants. But `bool` is not safe, because the contents of the high bits
|
||||
// in a rust bool are undefined. In practice, PostgreSQL uses 1 to represent
|
||||
// true and 0 for false, which is compatible with Rust bool, but let's try not to
|
||||
// depend on it.
|
||||
//
|
||||
// FIXME: ControlFileData does contain 'bool's at the moment.
|
||||
//
|
||||
// See https://github.com/zenithdb/zenith/issues/207 for discussion on the safety
|
||||
// of this.
|
||||
let mut b: [u8; SIZEOF_CONTROLDATA] = [0u8; SIZEOF_CONTROLDATA];
|
||||
b.copy_from_slice(&buf[0..SIZEOF_CONTROLDATA]);
|
||||
let controlfile: ControlFileData =
|
||||
unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
|
||||
|
||||
// Check the CRC
|
||||
if expectedcrc != controlfile.crc {
|
||||
bail!(
|
||||
"invalid CRC in control file: expected {:08X}, was {:08X}",
|
||||
expectedcrc,
|
||||
controlfile.crc
|
||||
);
|
||||
}
|
||||
|
||||
Ok(controlfile)
|
||||
}
|
||||
|
||||
///
|
||||
/// Convert a struct representing a Postgres control file into raw bytes.
|
||||
///
|
||||
/// The CRC is recomputed to match the contents of the fields.
|
||||
pub fn encode(&self) -> Bytes {
|
||||
//
|
||||
// Use `transmute` to reinterpret struct as raw bytes.
|
||||
//
|
||||
// FIXME: This triggers undefined behavior, because the contents
|
||||
// of the padding bytes are undefined, and this leaks those
|
||||
// undefined bytes into the resulting array. The Rust code won't
|
||||
// care what's in those bytes, and PostgreSQL doesn't care
|
||||
// either. HOWEVER, it is a potential security issue, because the
|
||||
// bytes can contain arbitrary pieces of memory from the page
|
||||
// server. In the worst case, that could be private keys or
|
||||
// another tenant's data.
|
||||
//
|
||||
// See https://github.com/zenithdb/zenith/issues/207 for discussion.
|
||||
let b: [u8; SIZEOF_CONTROLDATA] =
|
||||
unsafe { std::mem::transmute::<ControlFileData, [u8; SIZEOF_CONTROLDATA]>(*self) };
|
||||
|
||||
// Recompute the CRC
|
||||
let OFFSETOF_CRC = Self::pg_control_crc_offset();
|
||||
let newcrc = crc32c::crc32c(&b[0..OFFSETOF_CRC]);
|
||||
|
||||
let mut buf = BytesMut::with_capacity(PG_CONTROL_FILE_SIZE as usize);
|
||||
buf.extend_from_slice(&b[0..OFFSETOF_CRC]);
|
||||
buf.extend_from_slice(&newcrc.to_ne_bytes());
|
||||
// Fill the rest of the control file with zeros.
|
||||
buf.resize(PG_CONTROL_FILE_SIZE as usize, 0);
|
||||
|
||||
buf.into()
|
||||
}
|
||||
}
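A sketch of how this pair is typically used: read `global/pg_control` from a data directory, let `decode` verify the CRC, and `encode` when the file needs to be written back. The path handling and the `checkPoint` field access follow PostgreSQL's pg_control.h and are assumptions to check against the generated bindings; the function is not part of the diff:

```rust
use std::fs;
use anyhow::Result;
use postgres_ffi::{ControlFileData, PG_CONTROL_FILE_SIZE};

// Read, CRC-check, and re-encode a control file.
fn inspect_control_file(datadir: &std::path::Path) -> Result<()> {
    let bytes = fs::read(datadir.join("global").join("pg_control"))?;

    // decode() rejects short files and mismatching CRCs.
    let controlfile = ControlFileData::decode(&bytes)?;
    println!("latest checkpoint record at {:X}", controlfile.checkPoint);

    // encode() recomputes the CRC and pads back up to the full file size.
    let reencoded = controlfile.encode();
    assert_eq!(reencoded.len(), PG_CONTROL_FILE_SIZE as usize);
    Ok(())
}
```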
|
||||
postgres_ffi/src/lib.rs (new file, 10 lines)
@@ -0,0 +1,10 @@
|
||||
#![allow(non_upper_case_globals)]
|
||||
#![allow(non_camel_case_types)]
|
||||
#![allow(non_snake_case)]
|
||||
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
|
||||
|
||||
pub mod controlfile_utils;
|
||||
pub mod nonrelfile_utils;
|
||||
pub mod pg_constants;
|
||||
pub mod relfile_utils;
|
||||
pub mod xlog_utils;
|
||||
postgres_ffi/src/nonrelfile_utils.rs (new file, 32 lines)
@@ -0,0 +1,32 @@
|
||||
//!
|
||||
//! Common utilities for dealing with PostgreSQL non-relation files.
|
||||
//!
|
||||
use crate::pg_constants;
|
||||
use bytes::BytesMut;
|
||||
use log::*;
|
||||
|
||||
pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
|
||||
trace!(
|
||||
"handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)",
|
||||
status
|
||||
);
|
||||
|
||||
let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
|
||||
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
|
||||
|
||||
let bshift: u8 =
|
||||
((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
|
||||
|
||||
page[byteno] =
|
||||
(page[byteno] & !(pg_constants::CLOG_XACT_BITMASK << bshift)) | (status << bshift);
|
||||
}
|
||||
|
||||
pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 {
|
||||
let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
|
||||
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
|
||||
|
||||
let bshift: u8 =
|
||||
((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
|
||||
|
||||
((page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK) as u8
|
||||
}
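A small example of the bit layout these two helpers implement: two status bits per transaction, four transactions per byte, so one 8 kB CLOG page covers 32768 XIDs. The test is a sketch, not part of the diff:

```rust
use bytes::BytesMut;
use postgres_ffi::nonrelfile_utils::{transaction_id_get_status, transaction_id_set_status};
use postgres_ffi::pg_constants;

#[test]
fn clog_status_bits_example() {
    // Start from an all-zero CLOG page: every transaction is "in progress".
    let mut page = BytesMut::new();
    page.resize(pg_constants::BLCKSZ as usize, 0);

    let xid: u32 = 12345;
    transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, &mut page);

    assert_eq!(
        transaction_id_get_status(xid, &page),
        pg_constants::TRANSACTION_STATUS_COMMITTED
    );
    // A neighbouring transaction sharing the same byte is untouched.
    assert_eq!(
        transaction_id_get_status(xid + 1, &page),
        pg_constants::TRANSACTION_STATUS_IN_PROGRESS
    );
}
```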
|
||||
postgres_ffi/src/pg_constants.rs (new file, 187 lines)
@@ -0,0 +1,187 @@
|
||||
//!
|
||||
//! Misc constants, copied from PostgreSQL headers.
|
||||
//!
|
||||
//! TODO: These probably should be auto-generated using bindgen,
|
||||
//! rather than copied by hand. Although on the other hand, it's nice
|
||||
//! to have them all here in one place, and have the ability to add
|
||||
//! comments on them.
|
||||
//!
|
||||
|
||||
//
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
pub const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
pub const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
|
||||
//
|
||||
// Fork numbers, from relpath.h
|
||||
//
|
||||
pub const MAIN_FORKNUM: u8 = 0;
|
||||
pub const FSM_FORKNUM: u8 = 1;
|
||||
pub const VISIBILITYMAP_FORKNUM: u8 = 2;
|
||||
pub const INIT_FORKNUM: u8 = 3;
|
||||
|
||||
// From storage_xlog.h
|
||||
pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
|
||||
pub const SMGR_TRUNCATE_VM: u32 = 0x0002;
|
||||
pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;
|
||||
|
||||
// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and
|
||||
// --with-segsize=SEGSIZE, but assume the defaults for now.
|
||||
pub const BLCKSZ: u16 = 8192;
|
||||
pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32);
|
||||
|
||||
//
|
||||
// constants from clog.h
|
||||
//
|
||||
pub const CLOG_XACTS_PER_BYTE: u32 = 4;
|
||||
pub const CLOG_XACTS_PER_PAGE: u32 = BLCKSZ as u32 * CLOG_XACTS_PER_BYTE;
|
||||
pub const CLOG_BITS_PER_XACT: u8 = 2;
|
||||
pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1;
|
||||
|
||||
//
|
||||
// Constants from visibilitymap.h
|
||||
//
|
||||
pub const SIZE_OF_PAGE_HEADER: u16 = 24;
|
||||
pub const BITS_PER_HEAPBLOCK: u16 = 2;
|
||||
pub const HEAPBLOCKS_PER_PAGE: u16 = (BLCKSZ - SIZE_OF_PAGE_HEADER) * 8 / BITS_PER_HEAPBLOCK;
|
||||
|
||||
pub const TRANSACTION_STATUS_IN_PROGRESS: u8 = 0x00;
|
||||
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
|
||||
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
|
||||
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
|
||||
|
||||
pub const CLOG_ZEROPAGE: u8 = 0x00;
|
||||
pub const CLOG_TRUNCATE: u8 = 0x10;
|
||||
|
||||
// From xact.h
|
||||
pub const XLOG_XACT_COMMIT: u8 = 0x00;
|
||||
pub const XLOG_XACT_PREPARE: u8 = 0x10;
|
||||
pub const XLOG_XACT_ABORT: u8 = 0x20;
|
||||
pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
|
||||
pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
|
||||
|
||||
// From slru.h
|
||||
pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
|
||||
pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize;
|
||||
|
||||
/* mask for filtering opcodes out of xl_info */
|
||||
pub const XLOG_XACT_OPMASK: u8 = 0x70;
|
||||
pub const XLOG_HEAP_OPMASK: u8 = 0x70;
|
||||
/* does this record have a 'xinfo' field or not */
|
||||
pub const XLOG_XACT_HAS_INFO: u8 = 0x80;
|
||||
|
||||
/*
|
||||
* The following flags, stored in xinfo, determine which information is
|
||||
* contained in commit/abort records.
|
||||
*/
|
||||
pub const XACT_XINFO_HAS_DBINFO: u32 = 1u32 << 0;
|
||||
pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1;
|
||||
pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2;
|
||||
pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3;
|
||||
pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
|
||||
// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
|
||||
// pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6;
|
||||
// pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7;
|
||||
|
||||
// From pg_control.h and rmgrlist.h
|
||||
pub const XLOG_NEXTOID: u8 = 0x30;
|
||||
pub const XLOG_SWITCH: u8 = 0x40;
|
||||
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
|
||||
pub const DB_SHUTDOWNED: u32 = 1;
|
||||
|
||||
// From multixact.h
|
||||
pub const FIRST_MULTIXACT_ID: u32 = 1;
|
||||
pub const MAX_MULTIXACT_ID: u32 = 0xFFFFFFFF;
|
||||
|
||||
pub const XLOG_MULTIXACT_ZERO_OFF_PAGE: u8 = 0x00;
|
||||
pub const XLOG_MULTIXACT_ZERO_MEM_PAGE: u8 = 0x10;
|
||||
pub const XLOG_MULTIXACT_CREATE_ID: u8 = 0x20;
|
||||
pub const XLOG_MULTIXACT_TRUNCATE_ID: u8 = 0x30;
|
||||
|
||||
pub const MULTIXACT_OFFSETS_PER_PAGE: u16 = BLCKSZ / 4;
|
||||
pub const MXACT_MEMBER_BITS_PER_XACT: u16 = 8;
|
||||
pub const MXACT_MEMBER_FLAGS_PER_BYTE: u16 = 1;
|
||||
pub const MULTIXACT_FLAGBYTES_PER_GROUP: u16 = 4;
|
||||
pub const MULTIXACT_MEMBERS_PER_MEMBERGROUP: u16 =
|
||||
MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE;
|
||||
/* size in bytes of a complete group */
|
||||
pub const MULTIXACT_MEMBERGROUP_SIZE: u16 =
|
||||
4 * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP;
|
||||
pub const MULTIXACT_MEMBERGROUPS_PER_PAGE: u16 = BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE;
|
||||
pub const MULTIXACT_MEMBERS_PER_PAGE: u16 =
|
||||
MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP;
|
||||
|
||||
// From heapam_xlog.h
|
||||
pub const XLOG_HEAP_INSERT: u8 = 0x00;
|
||||
pub const XLOG_HEAP_DELETE: u8 = 0x10;
|
||||
pub const XLOG_HEAP_UPDATE: u8 = 0x20;
|
||||
pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
|
||||
pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
|
||||
pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
|
||||
pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
|
||||
pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
|
||||
pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
|
||||
pub const RM_XLOG_ID: u8 = 0;
|
||||
pub const RM_XACT_ID: u8 = 1;
|
||||
pub const RM_SMGR_ID: u8 = 2;
|
||||
pub const RM_CLOG_ID: u8 = 3;
|
||||
pub const RM_DBASE_ID: u8 = 4;
|
||||
pub const RM_TBLSPC_ID: u8 = 5;
|
||||
pub const RM_MULTIXACT_ID: u8 = 6;
|
||||
pub const RM_RELMAP_ID: u8 = 7;
|
||||
pub const RM_STANDBY_ID: u8 = 8;
|
||||
pub const RM_HEAP2_ID: u8 = 9;
|
||||
pub const RM_HEAP_ID: u8 = 10;
|
||||
|
||||
// from xlogreader.h
|
||||
pub const XLR_INFO_MASK: u8 = 0x0F;
|
||||
pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
|
||||
|
||||
// from dbcommands_xlog.h
|
||||
pub const XLOG_DBASE_CREATE: u8 = 0x00;
|
||||
pub const XLOG_DBASE_DROP: u8 = 0x10;
|
||||
|
||||
pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
|
||||
pub const XLOG_TBLSPC_DROP: u8 = 0x10;
|
||||
|
||||
pub const SIZEOF_XLOGRECORD: u32 = 24;
|
||||
|
||||
//
|
||||
// from xlogrecord.h
|
||||
//
|
||||
pub const XLR_MAX_BLOCK_ID: u8 = 32;
|
||||
|
||||
pub const XLR_BLOCK_ID_DATA_SHORT: u8 = 255;
|
||||
pub const XLR_BLOCK_ID_DATA_LONG: u8 = 254;
|
||||
pub const XLR_BLOCK_ID_ORIGIN: u8 = 253;
|
||||
pub const XLR_BLOCK_ID_TOPLEVEL_XID: u8 = 252;
|
||||
|
||||
pub const BKPBLOCK_FORK_MASK: u8 = 0x0F;
|
||||
pub const _BKPBLOCK_FLAG_MASK: u8 = 0xF0;
|
||||
pub const BKPBLOCK_HAS_IMAGE: u8 = 0x10; /* block data is an XLogRecordBlockImage */
|
||||
pub const BKPBLOCK_HAS_DATA: u8 = 0x20;
|
||||
pub const BKPBLOCK_WILL_INIT: u8 = 0x40; /* redo will re-init the page */
|
||||
pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous */
|
||||
|
||||
/* Information stored in bimg_info */
|
||||
pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
|
||||
pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
|
||||
pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
|
||||
|
||||
/* From transam.h */
|
||||
pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3;
|
||||
pub const INVALID_TRANSACTION_ID: u32 = 0;
|
||||
pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
|
||||
pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
|
||||
|
||||
/* FIXME: pageserver should request wal_seg_size from compute node */
|
||||
pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024;
|
||||
|
||||
pub const XLOG_BLCKSZ: usize = 8192;
|
||||
pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
|
||||
pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
|
||||
pub const XLP_LONG_HEADER: u16 = 0x0002;
|
||||
postgres_ffi/src/relfile_utils.rs (new file, 141 lines)
@@ -0,0 +1,141 @@
|
||||
//!
|
||||
//! Common utilities for dealing with PostgreSQL relation files.
|
||||
//!
|
||||
use crate::pg_constants;
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
#[derive(Debug, Clone, thiserror::Error, PartialEq)]
|
||||
pub enum FilePathError {
|
||||
#[error("invalid relation fork name")]
|
||||
InvalidForkName,
|
||||
#[error("invalid relation data file name")]
|
||||
InvalidFileName,
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(_e: core::num::ParseIntError) -> Self {
|
||||
FilePathError::InvalidFileName
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert Postgres relation file's fork suffix to fork number.
|
||||
pub fn forkname_to_number(forkname: Option<&str>) -> Result<u8, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(pg_constants::MAIN_FORKNUM),
|
||||
Some("fsm") => Ok(pg_constants::FSM_FORKNUM),
|
||||
Some("vm") => Ok(pg_constants::VISIBILITYMAP_FORKNUM),
|
||||
Some("init") => Ok(pg_constants::INIT_FORKNUM),
|
||||
Some(_) => Err(FilePathError::InvalidForkName),
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert Postgres fork number to the right suffix of the relation data file.
|
||||
pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
|
||||
match forknum {
|
||||
pg_constants::MAIN_FORKNUM => None,
|
||||
pg_constants::FSM_FORKNUM => Some("fsm"),
|
||||
pg_constants::VISIBILITYMAP_FORKNUM => Some("vm"),
|
||||
pg_constants::INIT_FORKNUM => Some("init"),
|
||||
_ => Some("UNKNOWN FORKNUM"),
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
|
||||
///
|
||||
/// Formats:
|
||||
/// <oid>
|
||||
/// <oid>_<fork name>
|
||||
/// <oid>.<segment number>
|
||||
/// <oid>_<fork name>.<segment number>
|
||||
///
|
||||
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
|
||||
///
|
||||
pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
|
||||
lazy_static! {
|
||||
static ref RELFILE_RE: Regex =
|
||||
Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
|
||||
}
|
||||
let caps = RELFILE_RE
|
||||
.captures(fname)
|
||||
.ok_or(FilePathError::InvalidFileName)?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode = relnode_str.parse::<u32>()?;
|
||||
|
||||
let forkname = caps.name("forkname").map(|f| f.as_str());
|
||||
let forknum = forkname_to_number(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
segno_match.unwrap().as_str().parse::<u32>()?
|
||||
};
|
||||
|
||||
Ok((relnode, forknum, segno))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_parse_valid_relfilenames() {
|
||||
assert_eq!(parse_relfilename("1234"), Ok((1234, 0, 0)));
|
||||
assert_eq!(parse_relfilename("1234_fsm"), Ok((1234, 1, 0)));
|
||||
assert_eq!(parse_relfilename("1234_vm"), Ok((1234, 2, 0)));
|
||||
assert_eq!(parse_relfilename("1234_init"), Ok((1234, 3, 0)));
|
||||
|
||||
assert_eq!(parse_relfilename("1234.12"), Ok((1234, 0, 12)));
|
||||
assert_eq!(parse_relfilename("1234_fsm.12"), Ok((1234, 1, 12)));
|
||||
assert_eq!(parse_relfilename("1234_vm.12"), Ok((1234, 2, 12)));
|
||||
assert_eq!(parse_relfilename("1234_init.12"), Ok((1234, 3, 12)));
|
||||
|
||||
// relfilenode is unsigned, so it can go up to 2^32-1
|
||||
assert_eq!(parse_relfilename("3147483648"), Ok((3147483648, 0, 0)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_invalid_relfilenames() {
|
||||
assert_eq!(
|
||||
parse_relfilename("foo"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
assert_eq!(
|
||||
parse_relfilename("1.2.3"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
assert_eq!(
|
||||
parse_relfilename("1234_invalid"),
|
||||
Err(FilePathError::InvalidForkName)
|
||||
);
|
||||
assert_eq!(
|
||||
parse_relfilename("1234_"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
|
||||
// too large for u32
|
||||
assert_eq!(
|
||||
parse_relfilename("12345678901"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
assert_eq!(
|
||||
parse_relfilename("-1234"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_weird_relfilenames() {
|
||||
// we accept 0 for the relfilenode, but PostgreSQL should never do that.
|
||||
assert_eq!(parse_relfilename("0"), Ok((0, 0, 0)));
|
||||
|
||||
// PostgreSQL has a limit of 2^32-2 blocks in a table. With 8k block size and
|
||||
// 1 GB segments, the max segment number is 32767. But we accept larger values
|
||||
// currently.
|
||||
assert_eq!(parse_relfilename("1.123456"), Ok((1, 0, 123456)));
|
||||
}
|
||||
}
|
||||
postgres_ffi/src/xlog_utils.rs (new file, 453 lines)
@@ -0,0 +1,453 @@
|
||||
//
|
||||
// This file contains common utilities for dealing with PostgreSQL WAL files and
|
||||
// LSNs.
|
||||
//
|
||||
// Many of these functions have been copied from PostgreSQL, and rewritten in
|
||||
// Rust. That's why they don't follow the usual Rust naming conventions, they
|
||||
// have been named the same as the corresponding PostgreSQL functions instead.
|
||||
//
|
||||
|
||||
use crate::pg_constants;
|
||||
use crate::CheckPoint;
|
||||
use crate::ControlFileData;
|
||||
use crate::FullTransactionId;
|
||||
use crate::XLogLongPageHeaderData;
|
||||
use crate::XLogPageHeaderData;
|
||||
use crate::XLogRecord;
|
||||
use crate::XLOG_PAGE_MAGIC;
|
||||
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{Buf, Bytes};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use crc32c::*;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
use std::fs::{self, File};
|
||||
use std::io::prelude::*;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::SystemTime;
|
||||
|
||||
pub const XLOG_FNAME_LEN: usize = 24;
|
||||
pub const XLOG_BLCKSZ: usize = 8192;
|
||||
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
|
||||
pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
|
||||
pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
|
||||
pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
|
||||
|
||||
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::<XLogPageHeaderData>();
|
||||
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::<XLogLongPageHeaderData>();
|
||||
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
|
||||
pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
|
||||
|
||||
pub type XLogRecPtr = u64;
|
||||
pub type TimeLineID = u32;
|
||||
pub type TimestampTz = i64;
|
||||
pub type XLogSegNo = u64;
|
||||
|
||||
const XID_CHECKPOINT_INTERVAL: u32 = 1024;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
(0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegNoOffsetToRecPtr(
|
||||
segno: XLogSegNo,
|
||||
offset: u32,
|
||||
wal_segsz_bytes: usize,
|
||||
) -> XLogRecPtr {
|
||||
segno * (wal_segsz_bytes as u64) + (offset as u64)
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
|
||||
return format!(
|
||||
"{:>08X}{:>08X}{:>08X}",
|
||||
tli,
|
||||
logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes),
|
||||
logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes)
|
||||
);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
|
||||
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
|
||||
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
|
||||
let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
|
||||
(log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
|
||||
}
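A round-trip sketch for the two helpers above, using the 16 MB default from `pg_constants::WAL_SEGMENT_SIZE` (so 256 segments per xlogid). The test is illustrative only and not part of the diff:

```rust
// Segment 0x2A on timeline 1 maps to the standard 24-character WAL file
// name and back again.
#[test]
fn xlog_file_name_roundtrip() {
    let wal_seg_size = pg_constants::WAL_SEGMENT_SIZE; // 16 MiB
    let (tli, segno): (TimeLineID, XLogSegNo) = (1, 0x2A);

    let fname = XLogFileName(tli, segno, wal_seg_size);
    assert_eq!(fname, "00000001000000000000002A");

    assert_eq!(XLogFromFileName(&fname, wal_seg_size), (segno, tli));
}
```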
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn IsXLogFileName(fname: &str) -> bool {
|
||||
return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit());
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn IsPartialXLogFileName(fname: &str) -> bool {
|
||||
fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
|
||||
}
|
||||
|
||||
pub fn get_current_timestamp() -> TimestampTz {
|
||||
const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */
|
||||
const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */
|
||||
const SECS_PER_DAY: u64 = 86400;
|
||||
const USECS_PER_SEC: u64 = 1000000;
|
||||
match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) {
|
||||
Ok(n) => {
|
||||
((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY))
|
||||
* USECS_PER_SEC
|
||||
+ n.subsec_micros() as u64) as i64
|
||||
}
|
||||
Err(_) => panic!("SystemTime before UNIX EPOCH!"),
|
||||
}
|
||||
}
|
||||
|
||||
fn find_end_of_wal_segment(
|
||||
data_dir: &Path,
|
||||
segno: XLogSegNo,
|
||||
tli: TimeLineID,
|
||||
wal_seg_size: usize,
|
||||
) -> u32 {
|
||||
let mut offs: usize = 0;
|
||||
let mut contlen: usize = 0;
|
||||
let mut wal_crc: u32 = 0;
|
||||
let mut crc: u32 = 0;
|
||||
let mut rec_offs: usize = 0;
|
||||
let mut buf = [0u8; XLOG_BLCKSZ];
|
||||
let file_name = XLogFileName(tli, segno, wal_seg_size);
|
||||
let mut last_valid_rec_pos: usize = 0;
|
||||
let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap();
|
||||
let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS];
|
||||
|
||||
while offs < wal_seg_size {
|
||||
if offs % XLOG_BLCKSZ == 0 {
|
||||
if let Ok(bytes_read) = file.read(&mut buf) {
|
||||
if bytes_read != buf.len() {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
let xlp_magic = LittleEndian::read_u16(&buf[0..2]);
|
||||
let xlp_info = LittleEndian::read_u16(&buf[2..4]);
|
||||
let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]);
|
||||
if xlp_magic != XLOG_PAGE_MAGIC as u16 {
|
||||
info!("Invalid WAL file {}.partial magic {}", file_name, xlp_magic);
|
||||
break;
|
||||
}
|
||||
if offs == 0 {
|
||||
offs = XLOG_SIZE_OF_XLOG_LONG_PHD;
|
||||
if (xlp_info & XLP_FIRST_IS_CONTRECORD) != 0 {
|
||||
offs += ((xlp_rem_len + 7) & !7) as usize;
|
||||
}
|
||||
} else {
|
||||
offs += XLOG_SIZE_OF_XLOG_SHORT_PHD;
|
||||
}
|
||||
} else if contlen == 0 {
|
||||
let page_offs = offs % XLOG_BLCKSZ;
|
||||
let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize;
|
||||
if xl_tot_len == 0 {
|
||||
break;
|
||||
}
|
||||
last_valid_rec_pos = offs;
|
||||
offs += 4;
|
||||
rec_offs = 4;
|
||||
contlen = xl_tot_len - 4;
|
||||
rec_hdr[0..4].copy_from_slice(&buf[page_offs..page_offs + 4]);
|
||||
} else {
|
||||
let page_offs = offs % XLOG_BLCKSZ;
|
||||
// We're continuing a record, possibly from the previous page.
|
||||
let pageleft = XLOG_BLCKSZ - page_offs;
|
||||
|
||||
// read the rest of the record, or as much as fits on this page.
|
||||
let n = min(contlen, pageleft);
|
||||
if rec_offs < XLOG_RECORD_CRC_OFFS {
|
||||
let len = min(XLOG_RECORD_CRC_OFFS - rec_offs, n);
|
||||
rec_hdr[rec_offs..rec_offs + len].copy_from_slice(&buf[page_offs..page_offs + len]);
|
||||
}
|
||||
if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD {
|
||||
let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS;
|
||||
wal_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]);
|
||||
crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]);
|
||||
crc = !crc;
|
||||
} else {
|
||||
crc ^= 0xFFFFFFFFu32;
|
||||
crc = crc32c_append(crc, &buf[page_offs..page_offs + n]);
|
||||
crc = !crc;
|
||||
}
|
||||
rec_offs += n;
|
||||
offs += n;
|
||||
contlen -= n;
|
||||
|
||||
if contlen == 0 {
|
||||
crc = !crc;
|
||||
crc = crc32c_append(crc, &rec_hdr);
|
||||
offs = (offs + 7) & !7; // pad to an 8-byte boundary
|
||||
if crc == wal_crc {
|
||||
last_valid_rec_pos = offs;
|
||||
} else {
|
||||
info!(
|
||||
"CRC mismatch {} vs {} at {}",
|
||||
crc, wal_crc, last_valid_rec_pos
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
last_valid_rec_pos as u32
|
||||
}
|
||||
|
||||
///
|
||||
/// Scan a directory that contains PostgreSQL WAL files, for the end of WAL.
|
||||
///
|
||||
pub fn find_end_of_wal(
|
||||
data_dir: &Path,
|
||||
wal_seg_size: usize,
|
||||
precise: bool,
|
||||
) -> (XLogRecPtr, TimeLineID) {
|
||||
let mut high_segno: XLogSegNo = 0;
|
||||
let mut high_tli: TimeLineID = 0;
|
||||
let mut high_ispartial = false;
|
||||
|
||||
for entry in fs::read_dir(data_dir).unwrap().flatten() {
|
||||
let ispartial: bool;
|
||||
let entry_name = entry.file_name();
|
||||
let fname = entry_name.to_str().unwrap();
|
||||
/*
|
||||
* Check if the filename looks like an xlog file, or a .partial file.
|
||||
*/
|
||||
if IsXLogFileName(fname) {
|
||||
ispartial = false;
|
||||
} else if IsPartialXLogFileName(fname) {
|
||||
ispartial = true;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
|
||||
if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 {
|
||||
continue;
|
||||
}
|
||||
if segno > high_segno
|
||||
|| (segno == high_segno && tli > high_tli)
|
||||
|| (segno == high_segno && tli == high_tli && high_ispartial && !ispartial)
|
||||
{
|
||||
high_segno = segno;
|
||||
high_tli = tli;
|
||||
high_ispartial = ispartial;
|
||||
}
|
||||
}
|
||||
if high_segno > 0 {
|
||||
let mut high_offs = 0;
|
||||
/*
|
||||
* Move the starting pointer to the start of the next segment, if the
|
||||
* highest one we saw was completed.
|
||||
*/
|
||||
if !high_ispartial {
|
||||
high_segno += 1;
|
||||
} else if precise {
|
||||
/* otherwise locate last record in last partial segment */
|
||||
high_offs = find_end_of_wal_segment(data_dir, high_segno, high_tli, wal_seg_size);
|
||||
}
|
||||
let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size);
|
||||
return (high_ptr, high_tli);
|
||||
}
|
||||
(0, 0)
|
||||
}
|
||||
|
||||
pub fn main() {
|
||||
let mut data_dir = PathBuf::new();
|
||||
data_dir.push(".");
|
||||
let wal_seg_size = 16 * 1024 * 1024;
|
||||
let (wal_end, tli) = find_end_of_wal(&data_dir, wal_seg_size, true);
|
||||
println!(
|
||||
"wal_end={:>08X}{:>08X}, tli={}",
|
||||
(wal_end >> 32) as u32,
|
||||
wal_end as u32,
|
||||
tli
|
||||
);
|
||||
}
|
||||
|
||||
impl XLogRecord {
|
||||
pub fn from_bytes(buf: &mut Bytes) -> XLogRecord {
|
||||
XLogRecord {
|
||||
xl_tot_len: buf.get_u32_le(),
|
||||
xl_xid: buf.get_u32_le(),
|
||||
xl_prev: buf.get_u64_le(),
|
||||
xl_info: buf.get_u8(),
|
||||
xl_rmid: buf.get_u8(),
|
||||
xl_crc: {
|
||||
buf.advance(2);
|
||||
buf.get_u32_le()
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode(&self) -> Bytes {
|
||||
let b: [u8; XLOG_SIZE_OF_XLOG_RECORD];
|
||||
b = unsafe { std::mem::transmute::<XLogRecord, [u8; XLOG_SIZE_OF_XLOG_RECORD]>(*self) };
|
||||
Bytes::copy_from_slice(&b[..])
|
||||
}
|
||||
|
||||
// Is this record an XLOG_SWITCH record? Those need some special processing.
|
||||
pub fn is_xlog_switch_record(&self) -> bool {
|
||||
self.xl_info == pg_constants::XLOG_SWITCH && self.xl_rmid == pg_constants::RM_XLOG_ID
|
||||
}
|
||||
}
|
||||
|
||||
impl XLogPageHeaderData {
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogPageHeaderData {
|
||||
let hdr: XLogPageHeaderData = XLogPageHeaderData {
|
||||
xlp_magic: buf.get_u16_le(),
|
||||
xlp_info: buf.get_u16_le(),
|
||||
xlp_tli: buf.get_u32_le(),
|
||||
xlp_pageaddr: buf.get_u64_le(),
|
||||
xlp_rem_len: buf.get_u32_le(),
|
||||
};
|
||||
buf.get_u32_le(); //padding
|
||||
hdr
|
||||
}
|
||||
}
|
||||
|
||||
impl XLogLongPageHeaderData {
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogLongPageHeaderData {
|
||||
XLogLongPageHeaderData {
|
||||
std: XLogPageHeaderData::from_bytes(buf),
|
||||
xlp_sysid: buf.get_u64_le(),
|
||||
xlp_seg_size: buf.get_u32_le(),
|
||||
xlp_xlog_blcksz: buf.get_u32_le(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode(&self) -> Bytes {
|
||||
let b: [u8; XLOG_SIZE_OF_XLOG_LONG_PHD];
|
||||
b = unsafe {
|
||||
std::mem::transmute::<XLogLongPageHeaderData, [u8; XLOG_SIZE_OF_XLOG_LONG_PHD]>(*self)
|
||||
};
|
||||
Bytes::copy_from_slice(&b[..])
|
||||
}
|
||||
}
|
||||
|
||||
pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>();
|
||||
|
||||
impl CheckPoint {
|
||||
pub fn new(lsn: u64, timeline: u32) -> CheckPoint {
|
||||
CheckPoint {
|
||||
redo: lsn,
|
||||
ThisTimeLineID: timeline,
|
||||
PrevTimeLineID: timeline,
|
||||
fullPageWrites: true, // TODO: get actual value of full_page_writes
|
||||
nextXid: FullTransactionId {
|
||||
value: pg_constants::FIRST_NORMAL_TRANSACTION_ID as u64,
|
||||
}, // TODO: handle epoch?
|
||||
nextOid: pg_constants::FIRST_BOOTSTRAP_OBJECT_ID,
|
||||
nextMulti: 1,
|
||||
nextMultiOffset: 0,
|
||||
oldestXid: pg_constants::FIRST_NORMAL_TRANSACTION_ID,
|
||||
oldestXidDB: 0,
|
||||
oldestMulti: 1,
|
||||
oldestMultiDB: 0,
|
||||
time: 0,
|
||||
oldestCommitTsXid: 0,
|
||||
newestCommitTsXid: 0,
|
||||
oldestActiveXid: pg_constants::INVALID_TRANSACTION_ID,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode(&self) -> Bytes {
|
||||
let b: [u8; SIZEOF_CHECKPOINT];
|
||||
b = unsafe { std::mem::transmute::<CheckPoint, [u8; SIZEOF_CHECKPOINT]>(*self) };
|
||||
Bytes::copy_from_slice(&b[..])
|
||||
}
|
||||
|
||||
pub fn decode(buf: &[u8]) -> Result<CheckPoint, anyhow::Error> {
|
||||
let mut b = [0u8; SIZEOF_CHECKPOINT];
|
||||
b.copy_from_slice(&buf[0..SIZEOF_CHECKPOINT]);
|
||||
let checkpoint: CheckPoint;
|
||||
checkpoint = unsafe { std::mem::transmute::<[u8; SIZEOF_CHECKPOINT], CheckPoint>(b) };
|
||||
Ok(checkpoint)
|
||||
}
|
||||
|
||||
// Update next XID based on provided new_xid and stored epoch.
|
||||
// Next XID should be greater than new_xid.
|
||||
// Also take 32-bit wrap-around into account.
|
||||
pub fn update_next_xid(&mut self, xid: u32) {
|
||||
let xid = xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
|
||||
let full_xid = self.nextXid.value;
|
||||
let new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
|
||||
let old_xid = full_xid as u32;
|
||||
if new_xid.wrapping_sub(old_xid) as i32 > 0 {
|
||||
let mut epoch = full_xid >> 32;
|
||||
if new_xid < old_xid {
|
||||
// wrap-around
|
||||
epoch += 1;
|
||||
}
|
||||
self.nextXid = FullTransactionId {
|
||||
value: (epoch << 32) | new_xid as u64,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
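update_next_xid rounds the incoming XID up to the next XID_CHECKPOINT_INTERVAL boundary and bumps the epoch held in the upper 32 bits of nextXid when the 32-bit counter wraps. A hedged sketch of the intended behaviour, assuming FIRST_NORMAL_TRANSACTION_ID is 3 as in PostgreSQL and that the fields are reachable from a sibling test module (illustrative only, not part of the diff):

```rust
#[cfg(test)]
mod next_xid_sketch {
    use super::*;

    #[test]
    fn rounds_up_and_bumps_epoch_on_wraparound() {
        let mut cp = CheckPoint::new(0, 1);

        // 100 is rounded up to the next 1024 boundary, plus one.
        cp.update_next_xid(100);
        assert_eq!(cp.nextXid.value, 1025);

        // Near the top of the 32-bit range, the next update wraps and the epoch is incremented.
        cp.nextXid = FullTransactionId { value: 0xFFFF_FF00 };
        cp.update_next_xid(0x10);
        assert_eq!(cp.nextXid.value >> 32, 1); // epoch bumped
        assert_eq!(cp.nextXid.value as u32, 1025); // low half wrapped around
    }
}
```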
|
||||
|
||||
//
|
||||
// Generate a new WAL segment with a single XLOG_CHECKPOINT_SHUTDOWN record.
|
||||
// We need this segment to start a compute node.
|
||||
// In order to minimize changes in Postgres core, we prefer to
|
||||
// provide a WAL segment from which it can extract the checkpoint record in the standard way,
|
||||
// rather than implement some alternative mechanism.
|
||||
//
|
||||
pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes {
|
||||
let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);
|
||||
|
||||
let hdr = XLogLongPageHeaderData {
|
||||
std: {
|
||||
XLogPageHeaderData {
|
||||
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
||||
xlp_info: pg_constants::XLP_LONG_HEADER,
|
||||
xlp_tli: 1, // FIXME: always use Postgres timeline 1
|
||||
xlp_pageaddr: pg_control.checkPoint - XLOG_SIZE_OF_XLOG_LONG_PHD as u64,
|
||||
xlp_rem_len: 0,
|
||||
}
|
||||
},
|
||||
xlp_sysid: pg_control.system_identifier,
|
||||
xlp_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32,
|
||||
xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
|
||||
};
|
||||
|
||||
let hdr_bytes = hdr.encode();
|
||||
seg_buf.extend_from_slice(&hdr_bytes);
|
||||
|
||||
let rec_hdr = XLogRecord {
|
||||
xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD
|
||||
+ SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT
|
||||
+ SIZEOF_CHECKPOINT) as u32,
|
||||
xl_xid: 0, //0 is for InvalidTransactionId
|
||||
xl_prev: 0,
|
||||
xl_info: pg_constants::XLOG_CHECKPOINT_SHUTDOWN,
|
||||
xl_rmid: pg_constants::RM_XLOG_ID,
|
||||
xl_crc: 0,
|
||||
};
|
||||
|
||||
let mut rec_short_hdr_bytes = BytesMut::new();
|
||||
rec_short_hdr_bytes.put_u8(pg_constants::XLR_BLOCK_ID_DATA_SHORT);
|
||||
rec_short_hdr_bytes.put_u8(SIZEOF_CHECKPOINT as u8);
|
||||
|
||||
let rec_bytes = rec_hdr.encode();
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
|
||||
//calculate record checksum
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &rec_short_hdr_bytes[..]);
|
||||
crc = crc32c_append(crc, &checkpoint_bytes[..]);
|
||||
crc = crc32c_append(crc, &rec_bytes[0..XLOG_RECORD_CRC_OFFS]);
|
||||
|
||||
seg_buf.extend_from_slice(&rec_bytes[0..XLOG_RECORD_CRC_OFFS]);
|
||||
seg_buf.put_u32_le(crc);
|
||||
seg_buf.extend_from_slice(&rec_short_hdr_bytes);
|
||||
seg_buf.extend_from_slice(&checkpoint_bytes);
|
||||
|
||||
//zero out the rest of the file
|
||||
seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0);
|
||||
seg_buf.freeze()
|
||||
}
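A minimal usage sketch for generate_wal_segment, writing the bootstrap segment out under the name the startup code expects. The wal_dir path and the pg_control value are assumed to come from elsewhere, and the helper name is made up for illustration; timeline 1 matches the FIXME above:

```rust
use std::fs;
use std::path::Path;

// Hypothetical helper, not part of the diff: materialize the bootstrap WAL segment on disk.
fn write_bootstrap_segment(wal_dir: &Path, pg_control: &ControlFileData) -> std::io::Result<()> {
    let seg = generate_wal_segment(pg_control);
    // The segment that contains the shutdown checkpoint record, on timeline 1.
    let segno = pg_control.checkPoint / pg_constants::WAL_SEGMENT_SIZE as u64;
    let fname = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
    fs::write(wal_dir.join(fname), &seg)
}
```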
|
||||
21
proxy/Cargo.toml
Normal file
@@ -0,0 +1,21 @@
|
||||
[package]
|
||||
name = "proxy"
|
||||
version = "0.1.0"
|
||||
authors = ["Stas Kelvich <stas.kelvich@gmail.com>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
bytes = { version = "1.0.1", features = ['serde'] }
|
||||
md5 = "0.7.0"
|
||||
rand = "0.8.3"
|
||||
hex = "0.4.3"
|
||||
serde = "1"
|
||||
serde_json = "1"
|
||||
tokio = { version = "1.7.1", features = ["full"] }
|
||||
tokio-postgres = "0.7.2"
|
||||
clap = "2.33.0"
|
||||
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
92
proxy/src/cplane_api.rs
Normal file
@@ -0,0 +1,92 @@
|
||||
use anyhow::{bail, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
net::{IpAddr, SocketAddr},
|
||||
};
|
||||
|
||||
pub struct CPlaneApi {
|
||||
// address: SocketAddr,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct DatabaseInfo {
|
||||
pub host: IpAddr, // TODO: allow host name here too
|
||||
pub port: u16,
|
||||
pub dbname: String,
|
||||
pub user: String,
|
||||
pub password: String,
|
||||
}
|
||||
|
||||
impl DatabaseInfo {
|
||||
pub fn socket_addr(&self) -> SocketAddr {
|
||||
SocketAddr::new(self.host, self.port)
|
||||
}
|
||||
|
||||
pub fn conn_string(&self) -> String {
|
||||
format!(
|
||||
"dbname={} user={} password={}",
|
||||
self.dbname, self.user, self.password
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// mock cplane api
|
||||
impl CPlaneApi {
|
||||
pub fn new(_address: &SocketAddr) -> CPlaneApi {
|
||||
CPlaneApi {
|
||||
// address: address.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn check_auth(&self, user: &str, md5_response: &[u8], salt: &[u8; 4]) -> Result<()> {
|
||||
// the password for both users is "mypass"
|
||||
let auth_map: HashMap<_, &str> = vec![
|
||||
("stas@zenith", "716ee6e1c4a9364d66285452c47402b1"),
|
||||
("stas2@zenith", "3996f75df64c16a8bfaf01301b61d582"),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let stored_hash = auth_map
|
||||
.get(&user)
|
||||
.ok_or_else(|| anyhow::Error::msg("user not found"))?;
|
||||
let salted_stored_hash = format!(
|
||||
"md5{:x}",
|
||||
md5::compute([stored_hash.as_bytes(), salt].concat())
|
||||
);
|
||||
|
||||
let received_hash = std::str::from_utf8(&md5_response)?;
|
||||
|
||||
println!(
|
||||
"auth: {} rh={} sh={} ssh={} {:?}",
|
||||
user, received_hash, stored_hash, salted_stored_hash, salt
|
||||
);
|
||||
|
||||
if received_hash == salted_stored_hash {
|
||||
Ok(())
|
||||
} else {
|
||||
bail!("Auth failed")
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_database_uri(&self, _user: &str, _database: &str) -> Result<DatabaseInfo> {
|
||||
Ok(DatabaseInfo {
|
||||
host: "127.0.0.1".parse()?,
|
||||
port: 5432,
|
||||
dbname: "stas".to_string(),
|
||||
user: "stas".to_string(),
|
||||
password: "mypass".to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
// pub fn create_database(&self, _user: &String, _database: &String) -> Result<DatabaseInfo> {
|
||||
// Ok(DatabaseInfo {
|
||||
// host: "127.0.0.1".parse()?,
|
||||
// port: 5432,
|
||||
// dbname: "stas".to_string(),
|
||||
// user: "stas".to_string(),
|
||||
// password: "mypass".to_string(),
|
||||
// })
|
||||
// }
|
||||
}
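check_auth mirrors PostgreSQL's MD5 handshake: the control plane stores md5(password || username) as hex, and the client answers with "md5" followed by md5(stored_hex || salt). A hedged sketch of how such a response could be produced on the client side, reusing the md5 crate already listed in Cargo.toml (the helper name is made up for illustration):

```rust
// Hypothetical helper, not part of the diff: build the response psql would send
// for the given credentials and the 4-byte salt from AuthenticationMD5Password.
fn md5_response(user: &str, password: &str, salt: &[u8; 4]) -> String {
    // Inner hash, as stored by the (mock) control plane: md5(password || user), hex-encoded.
    let stored = format!("{:x}", md5::compute([password.as_bytes(), user.as_bytes()].concat()));
    // Outer hash, as sent on the wire: "md5" || md5(stored_hex || salt).
    format!("md5{:x}", md5::compute([stored.as_bytes(), &salt[..]].concat()))
}
```

With the mock credentials above, check_auth("stas@zenith", md5_response("stas@zenith", "mypass", &salt).as_bytes(), &salt) should then succeed.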
|
||||
106
proxy/src/main.rs
Normal file
@@ -0,0 +1,106 @@
|
||||
///
|
||||
/// Postgres protocol proxy/router.
|
||||
///
|
||||
/// This service listens on the psql port and can check auth via an external service
|
||||
/// (control plane API in our case) and can create new databases and accounts
|
||||
/// in a somewhat transparent manner (again via communication with the control plane API).
|
||||
///
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
net::{SocketAddr, TcpListener},
|
||||
sync::{mpsc, Mutex},
|
||||
thread,
|
||||
};
|
||||
|
||||
use clap::{App, Arg};
|
||||
|
||||
use cplane_api::DatabaseInfo;
|
||||
|
||||
mod cplane_api;
|
||||
mod mgmt;
|
||||
mod proxy;
|
||||
|
||||
pub struct ProxyConf {
|
||||
/// main entrypoint for users to connect to
|
||||
pub proxy_address: SocketAddr,
|
||||
|
||||
/// http management endpoint. Upon user account creation control plane
|
||||
/// will notify us here, so that we can 'unfreeze' user session.
|
||||
pub mgmt_address: SocketAddr,
|
||||
|
||||
/// send unauthenticated users to this URI
|
||||
pub redirect_uri: String,
|
||||
|
||||
/// control plane address where we would check auth.
|
||||
pub cplane_address: SocketAddr,
|
||||
}
|
||||
|
||||
pub struct ProxyState {
|
||||
pub conf: ProxyConf,
|
||||
pub waiters: Mutex<HashMap<String, mpsc::Sender<anyhow::Result<DatabaseInfo>>>>,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = App::new("Zenith proxy/router")
|
||||
.arg(
|
||||
Arg::with_name("proxy")
|
||||
.short("p")
|
||||
.long("proxy")
|
||||
.takes_value(true)
|
||||
.help("listen for incoming client connections on ip:port")
|
||||
.default_value("127.0.0.1:4432"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("mgmt")
|
||||
.short("m")
|
||||
.long("mgmt")
|
||||
.takes_value(true)
|
||||
.help("listen for management callback connection on ip:port")
|
||||
.default_value("127.0.0.1:7000"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("uri")
|
||||
.short("u")
|
||||
.long("uri")
|
||||
.takes_value(true)
|
||||
.help("redirect unauthenticated users to given uri")
|
||||
.default_value("http://localhost:3000/psql_session/"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let conf = ProxyConf {
|
||||
proxy_address: arg_matches.value_of("proxy").unwrap().parse()?,
|
||||
mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?,
|
||||
redirect_uri: arg_matches.value_of("uri").unwrap().parse()?,
|
||||
cplane_address: "127.0.0.1:3000".parse()?,
|
||||
};
|
||||
let state = ProxyState {
|
||||
conf,
|
||||
waiters: Mutex::new(HashMap::new()),
|
||||
};
|
||||
let state: &'static ProxyState = Box::leak(Box::new(state));
|
||||
|
||||
// Check that we can bind to address before further initialization
|
||||
println!("Starting proxy on {}", state.conf.proxy_address);
|
||||
let pageserver_listener = TcpListener::bind(state.conf.proxy_address)?;
|
||||
|
||||
println!("Starting mgmt on {}", state.conf.mgmt_address);
|
||||
let mgmt_listener = TcpListener::bind(state.conf.mgmt_address)?;
|
||||
|
||||
let threads = vec![
|
||||
// Spawn a thread to listen for connections. It will spawn further threads
|
||||
// for each connection.
|
||||
thread::Builder::new()
|
||||
.name("Proxy thread".into())
|
||||
.spawn(move || proxy::thread_main(&state, pageserver_listener))?,
|
||||
thread::Builder::new()
|
||||
.name("Mgmt thread".into())
|
||||
.spawn(move || mgmt::thread_main(&state, mgmt_listener))?,
|
||||
];
|
||||
|
||||
for t in threads.into_iter() {
|
||||
t.join().unwrap()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
111
proxy/src/mgmt.rs
Normal file
@@ -0,0 +1,111 @@
|
||||
use std::{
|
||||
net::{TcpListener, TcpStream},
|
||||
thread,
|
||||
};
|
||||
|
||||
use anyhow::bail;
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use zenith_utils::{
|
||||
postgres_backend::{self, query_from_cstring, PostgresBackend},
|
||||
pq_proto::{BeMessage, SINGLE_COL_ROWDESC},
|
||||
};
|
||||
|
||||
use crate::{cplane_api::DatabaseInfo, ProxyState};
|
||||
|
||||
///
|
||||
/// Main management listener loop.
|
||||
///
|
||||
/// Listens for connections, and launches a new handler thread for each.
|
||||
///
|
||||
pub fn thread_main(state: &'static ProxyState, listener: TcpListener) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept()?;
|
||||
println!("accepted connection from {}", peer_addr);
|
||||
socket.set_nodelay(true).unwrap();
|
||||
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = mgmt_conn_main(state, socket) {
|
||||
println!("error: {}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
pub fn mgmt_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
|
||||
let mut conn_handler = MgmtHandler { state };
|
||||
let mut pgbackend = PostgresBackend::new(socket, postgres_backend::AuthType::Trust)?;
|
||||
pgbackend.run(&mut conn_handler)
|
||||
}
|
||||
|
||||
struct MgmtHandler {
|
||||
state: &'static ProxyState,
|
||||
}
|
||||
/// Serialized examples:
|
||||
// {
|
||||
// "session_id": "71d6d03e6d93d99a",
|
||||
// "result": {
|
||||
// "Success": {
|
||||
// "host": "127.0.0.1",
|
||||
// "port": 5432,
|
||||
// "dbname": "stas",
|
||||
// "user": "stas"
|
||||
// "password": "mypass"
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// {
|
||||
// "session_id": "71d6d03e6d93d99a",
|
||||
// "result": {
|
||||
// "Failure": "oops"
|
||||
// }
|
||||
// }
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct PsqlSessionResponse {
|
||||
session_id: String,
|
||||
result: PsqlSessionResult,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum PsqlSessionResult {
|
||||
Success(DatabaseInfo),
|
||||
Failure(String),
|
||||
}
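The commented examples above match serde's default externally tagged representation for PsqlSessionResult. A small serialization sketch (illustrative only, not part of the diff):

```rust
#[cfg(test)]
mod serialization_sketch {
    use super::*;

    #[test]
    fn success_response_shape() {
        let resp = PsqlSessionResponse {
            session_id: "71d6d03e6d93d99a".to_string(),
            result: PsqlSessionResult::Success(DatabaseInfo {
                host: "127.0.0.1".parse().unwrap(),
                port: 5432,
                dbname: "stas".to_string(),
                user: "stas".to_string(),
                password: "mypass".to_string(),
            }),
        };
        // Default serde representation is externally tagged: {"result":{"Success":{...}}}.
        let json = serde_json::to_string(&resp).unwrap();
        assert!(json.contains("\"Success\""));
    }
}
```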
|
||||
|
||||
impl postgres_backend::Handler for MgmtHandler {
|
||||
fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
query_string: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
let query_string = query_from_cstring(query_string);
|
||||
|
||||
println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);
|
||||
|
||||
let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;
|
||||
|
||||
let waiters = self.state.waiters.lock().unwrap();
|
||||
|
||||
let sender = waiters
|
||||
.get(&resp.session_id)
|
||||
.ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;
|
||||
|
||||
match resp.result {
|
||||
PsqlSessionResult::Success(db_info) => {
|
||||
sender.send(Ok(db_info))?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
pgb.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
PsqlSessionResult::Failure(message) => {
|
||||
sender.send(Err(anyhow::Error::msg(message.clone())))?;
|
||||
|
||||
bail!("psql session request failed: {}", message)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
256
proxy/src/proxy.rs
Normal file
@@ -0,0 +1,256 @@
|
||||
use crate::cplane_api::CPlaneApi;
|
||||
use crate::cplane_api::DatabaseInfo;
|
||||
use crate::ProxyState;
|
||||
|
||||
use anyhow::bail;
|
||||
use tokio_postgres::NoTls;
|
||||
|
||||
use rand::Rng;
|
||||
use std::sync::mpsc::channel;
|
||||
use std::thread;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use zenith_utils::postgres_backend::{PostgresBackend, ProtoState};
|
||||
use zenith_utils::pq_proto::*;
|
||||
use zenith_utils::{postgres_backend, pq_proto::BeMessage};
|
||||
|
||||
///
|
||||
/// Main proxy listener loop.
|
||||
///
|
||||
/// Listens for connections, and launches a new handler thread for each.
|
||||
///
|
||||
pub fn thread_main(
|
||||
state: &'static ProxyState,
|
||||
listener: std::net::TcpListener,
|
||||
) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept()?;
|
||||
println!("accepted connection from {}", peer_addr);
|
||||
socket.set_nodelay(true).unwrap();
|
||||
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = proxy_conn_main(state, socket) {
|
||||
println!("error: {}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// XXX: clean up fields
|
||||
struct ProxyConnection {
|
||||
state: &'static ProxyState,
|
||||
|
||||
cplane: CPlaneApi,
|
||||
|
||||
user: String,
|
||||
database: String,
|
||||
|
||||
pgb: PostgresBackend,
|
||||
md5_salt: [u8; 4],
|
||||
|
||||
psql_session_id: String,
|
||||
}
|
||||
|
||||
pub fn proxy_conn_main(
|
||||
state: &'static ProxyState,
|
||||
socket: std::net::TcpStream,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut conn = ProxyConnection {
|
||||
state,
|
||||
cplane: CPlaneApi::new(&state.conf.cplane_address),
|
||||
user: "".into(),
|
||||
database: "".into(),
|
||||
pgb: PostgresBackend::new(socket, postgres_backend::AuthType::MD5)?,
|
||||
md5_salt: [0u8; 4],
|
||||
psql_session_id: "".into(),
|
||||
};
|
||||
|
||||
// Check StartupMessage
|
||||
// This will set conn.user and conn.database, so we can decide on the next actions
|
||||
conn.handle_startup()?;
|
||||
|
||||
// both scenarios here should end up producing a database connection string
|
||||
let db_info = if conn.is_existing_user() {
|
||||
conn.handle_existing_user()?
|
||||
} else {
|
||||
conn.handle_new_user()?
|
||||
};
|
||||
|
||||
// ok, now proxy the user's connection through to the database
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let _ = runtime.block_on(proxy_pass(conn.pgb, db_info))?;
|
||||
|
||||
println!("proxy_conn_main done;");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl ProxyConnection {
|
||||
fn is_existing_user(&self) -> bool {
|
||||
self.user.ends_with("@zenith")
|
||||
}
|
||||
|
||||
fn handle_startup(&mut self) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let msg = self.pgb.read_message()?;
|
||||
println!("got message {:?}", msg);
|
||||
match msg {
|
||||
Some(FeMessage::StartupMessage(m)) => {
|
||||
println!("got startup message {:?}", m);
|
||||
|
||||
match m.kind {
|
||||
StartupRequestCode::NegotiateGss | StartupRequestCode::NegotiateSsl => {
|
||||
println!("SSL requested");
|
||||
self.pgb.write_message(&BeMessage::Negotiate)?;
|
||||
}
|
||||
StartupRequestCode::Normal => {
|
||||
self.user = m
|
||||
.params
|
||||
.get("user")
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::msg("user is required in startup packet")
|
||||
})?
|
||||
.into();
|
||||
self.database = m
|
||||
.params
|
||||
.get("database")
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::msg("database is required in startup packet")
|
||||
})?
|
||||
.into();
|
||||
|
||||
break;
|
||||
}
|
||||
StartupRequestCode::Cancel => break,
|
||||
}
|
||||
}
|
||||
None => {
|
||||
bail!("connection closed")
|
||||
}
|
||||
unexpected => {
|
||||
bail!("unexpected message type : {:?}", unexpected)
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_existing_user(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
// ask for the password
|
||||
rand::thread_rng().fill(&mut self.md5_salt);
|
||||
self.pgb
|
||||
.write_message(&BeMessage::AuthenticationMD5Password(&self.md5_salt))?;
|
||||
self.pgb.state = ProtoState::Authentication; // XXX
|
||||
|
||||
// check password
|
||||
println!("handle_existing_user");
|
||||
let msg = self.pgb.read_message()?;
|
||||
println!("got message {:?}", msg);
|
||||
if let Some(FeMessage::PasswordMessage(m)) = msg {
|
||||
println!("got password message '{:?}'", m);
|
||||
|
||||
assert!(self.is_existing_user());
|
||||
|
||||
let (_trailing_null, md5_response) = m
|
||||
.split_last()
|
||||
.ok_or_else(|| anyhow::Error::msg("unexpected password message"))?;
|
||||
|
||||
if let Err(e) = self.check_auth_md5(md5_response) {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
|
||||
bail!("auth failed: {}", e);
|
||||
} else {
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
}
|
||||
|
||||
// ok, we are authorized
|
||||
self.cplane.get_database_uri(&self.user, &self.database)
|
||||
}
|
||||
|
||||
fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
let mut psql_session_id_buf = [0u8; 8];
|
||||
rand::thread_rng().fill(&mut psql_session_id_buf);
|
||||
self.psql_session_id = hex::encode(psql_session_id_buf);
|
||||
|
||||
let hello_message = format!("☀️ Welcome to Zenith!
|
||||
|
||||
To proceed with database creation, open the following link:
|
||||
|
||||
{}{}
|
||||
|
||||
This only needs to be done once; afterwards we will send you a '.pgpass' file which will allow you to access or create
|
||||
databases without opening the browser.
|
||||
|
||||
", self.state.conf.redirect_uri,self.psql_session_id);
|
||||
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.pgb
|
||||
.write_message(&BeMessage::NoticeResponse(hello_message))?;
|
||||
|
||||
// wait for database creation
|
||||
let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
|
||||
let _ = self
|
||||
.state
|
||||
.waiters
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(self.psql_session_id.clone(), tx);
|
||||
|
||||
// Wait for web console response
|
||||
// XXX: respond with error to client
|
||||
let dbinfo = rx.recv()??;
|
||||
|
||||
self.pgb.write_message_noflush(&BeMessage::NoticeResponse(
|
||||
"Connecting to database.".to_string(),
|
||||
))?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
|
||||
Ok(dbinfo)
|
||||
}
|
||||
|
||||
fn check_auth_md5(&self, md5_response: &[u8]) -> anyhow::Result<()> {
|
||||
assert!(self.is_existing_user());
|
||||
self.cplane
|
||||
.check_auth(self.user.as_str(), md5_response, &self.md5_salt)
|
||||
}
|
||||
}
|
||||
|
||||
async fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> {
|
||||
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()).await?;
|
||||
let config = db_info.conn_string().parse::<tokio_postgres::Config>()?;
|
||||
let _ = config.connect_raw(&mut socket, NoTls).await?;
|
||||
|
||||
println!("Connected to pg, proxying");
|
||||
|
||||
let incoming_std = pgb.into_stream();
|
||||
incoming_std.set_nonblocking(true)?;
|
||||
let mut incoming_conn = tokio::net::TcpStream::from_std(incoming_std)?;
|
||||
|
||||
let (mut ri, mut wi) = incoming_conn.split();
|
||||
let (mut ro, mut wo) = socket.split();
|
||||
|
||||
let client_to_server = async {
|
||||
tokio::io::copy(&mut ri, &mut wo).await?;
|
||||
wo.shutdown().await
|
||||
};
|
||||
|
||||
let server_to_client = async {
|
||||
tokio::io::copy(&mut ro, &mut wi).await?;
|
||||
wi.shutdown().await
|
||||
};
|
||||
|
||||
tokio::try_join!(client_to_server, server_to_client)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
11
run_clippy.sh
Executable file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
|
||||
|
||||
# If you save this in your path under the name "cargo-zclippy" (or whatever
|
||||
# name you like), then you can run it as "cargo zclippy" from the shell prompt.
|
||||
#
|
||||
# If your text editor has rust-analyzer integration, you can also use this new
|
||||
# command as a replacement for "cargo check" or "cargo clippy" and see clippy
|
||||
# warnings and errors right in the editor.
|
||||
# In vscode, this setting is Rust-analyzer>Check On Save:Command
|
||||
|
||||
cargo clippy "${@:2}" -- -A clippy::new_without_default -A clippy::manual_range_contains -A clippy::comparison_chain
|
||||
18
test_runner/Pipfile
Normal file
@@ -0,0 +1,18 @@
|
||||
[[source]]
|
||||
url = "https://pypi.python.org/simple"
|
||||
verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
pytest = ">=6.0.0"
|
||||
psycopg2 = "*"
|
||||
typing-extensions = "*"
|
||||
|
||||
[dev-packages]
|
||||
yapf = "*"
|
||||
flake8 = "*"
|
||||
mypy = "*"
|
||||
|
||||
[requires]
|
||||
# we need at least 3.6, but pipenv doesn't allow to say this directly
|
||||
python_version = "3"
|
||||
269
test_runner/Pipfile.lock
generated
Normal file
@@ -0,0 +1,269 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "4c20c05c20c50cf7e8f78ab461ab23841125345e63e00e2efa7661c165b6b364"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_version": "3"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
"name": "pypi",
|
||||
"url": "https://pypi.python.org/simple",
|
||||
"verify_ssl": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"attrs": {
|
||||
"hashes": [
|
||||
"sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
|
||||
"sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==21.2.0"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:833b26fb89d5de469b24a390e9df088d4e52e4ba33b01dc5e0e4f41b81a16c00",
|
||||
"sha256:b142cc1dd1342f31ff04bb7d022492b09920cb64fed867cd3ea6f80fe3ebd139"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==4.5.0"
|
||||
},
|
||||
"iniconfig": {
|
||||
"hashes": [
|
||||
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
||||
"sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
|
||||
],
|
||||
"version": "==1.1.1"
|
||||
},
|
||||
"packaging": {
|
||||
"hashes": [
|
||||
"sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5",
|
||||
"sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==20.9"
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
"sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
|
||||
"sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.13.1"
|
||||
},
|
||||
"psycopg2": {
|
||||
"hashes": [
|
||||
"sha256:03a485bf71498870e38b535c0e6e7162d6ac06a91487edddc3b959894d65f79c",
|
||||
"sha256:22102cfeb904898254f287b1a77360bf66c636858e7476593acd5267e5c24ff9",
|
||||
"sha256:8f4c1800e57ad128d20b2e91d222ca238fffd316cef65be781361cdf35e37979",
|
||||
"sha256:b12073fdf2002e828e5921be2c39ff9c6eab361c5c0bd6c529619fc23677accc",
|
||||
"sha256:b6f47af317af8110818d255e693cfa80b7f1e435285be09778db7b66efd95789",
|
||||
"sha256:d549db98fc0e6db41a2aa0d65f7434c4308a9f64012adb209b9e489f26fe87c6",
|
||||
"sha256:e44e39a46af7c30566b7667fb27e701e652ab0a51e05c263a01d3ff0e223b765",
|
||||
"sha256:e84c80be7a238d3c9c099b71f6890eaa35fc881146232cce888a88ab1bfb431e",
|
||||
"sha256:f3d42bd42302293767b84206d9a446abc67ed4a133e4fe04dad8952de06c2091"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.9"
|
||||
},
|
||||
"py": {
|
||||
"hashes": [
|
||||
"sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
|
||||
"sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.10.0"
|
||||
},
|
||||
"pyparsing": {
|
||||
"hashes": [
|
||||
"sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
|
||||
"sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.4.7"
|
||||
},
|
||||
"pytest": {
|
||||
"hashes": [
|
||||
"sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b",
|
||||
"sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==6.2.4"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497",
|
||||
"sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342",
|
||||
"sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.0"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76",
|
||||
"sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.4.1"
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
"flake8": {
|
||||
"hashes": [
|
||||
"sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b",
|
||||
"sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.9.2"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:833b26fb89d5de469b24a390e9df088d4e52e4ba33b01dc5e0e4f41b81a16c00",
|
||||
"sha256:b142cc1dd1342f31ff04bb7d022492b09920cb64fed867cd3ea6f80fe3ebd139"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==4.5.0"
|
||||
},
|
||||
"mccabe": {
|
||||
"hashes": [
|
||||
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
|
||||
"sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
|
||||
],
|
||||
"version": "==0.6.1"
|
||||
},
|
||||
"mypy": {
|
||||
"hashes": [
|
||||
"sha256:0190fb77e93ce971954c9e54ea61de2802065174e5e990c9d4c1d0f54fbeeca2",
|
||||
"sha256:0756529da2dd4d53d26096b7969ce0a47997123261a5432b48cc6848a2cb0bd4",
|
||||
"sha256:2f9fedc1f186697fda191e634ac1d02f03d4c260212ccb018fabbb6d4b03eee8",
|
||||
"sha256:353aac2ce41ddeaf7599f1c73fed2b75750bef3b44b6ad12985a991bc002a0da",
|
||||
"sha256:3f12705eabdd274b98f676e3e5a89f247ea86dc1af48a2d5a2b080abac4e1243",
|
||||
"sha256:4efc67b9b3e2fddbe395700f91d5b8deb5980bfaaccb77b306310bd0b9e002eb",
|
||||
"sha256:517e7528d1be7e187a5db7f0a3e479747307c1b897d9706b1c662014faba3116",
|
||||
"sha256:68a098c104ae2b75e946b107ef69dd8398d54cb52ad57580dfb9fc78f7f997f0",
|
||||
"sha256:746e0b0101b8efec34902810047f26a8c80e1efbb4fc554956d848c05ef85d76",
|
||||
"sha256:8be7bbd091886bde9fcafed8dd089a766fa76eb223135fe5c9e9798f78023a20",
|
||||
"sha256:9236c21194fde5df1b4d8ebc2ef2c1f2a5dc7f18bcbea54274937cae2e20a01c",
|
||||
"sha256:9ef5355eaaf7a23ab157c21a44c614365238a7bdb3552ec3b80c393697d974e1",
|
||||
"sha256:9f1d74eeb3f58c7bd3f3f92b8f63cb1678466a55e2c4612bf36909105d0724ab",
|
||||
"sha256:a26d0e53e90815c765f91966442775cf03b8a7514a4e960de7b5320208b07269",
|
||||
"sha256:ae94c31bb556ddb2310e4f913b706696ccbd43c62d3331cd3511caef466871d2",
|
||||
"sha256:b5ba1f0d5f9087e03bf5958c28d421a03a4c1ad260bf81556195dffeccd979c4",
|
||||
"sha256:b5dfcd22c6bab08dfeded8d5b44bdcb68c6f1ab261861e35c470b89074f78a70",
|
||||
"sha256:cd01c599cf9f897b6b6c6b5d8b182557fb7d99326bcdf5d449a0fbbb4ccee4b9",
|
||||
"sha256:e89880168c67cf4fde4506b80ee42f1537ad66ad366c101d388b3fd7d7ce2afd",
|
||||
"sha256:ebe2bc9cb638475f5d39068d2dbe8ae1d605bb8d8d3ff281c695df1670ab3987",
|
||||
"sha256:f89bfda7f0f66b789792ab64ce0978e4a991a0e4dd6197349d0767b0f1095b21",
|
||||
"sha256:fc4d63da57ef0e8cd4ab45131f3fe5c286ce7dd7f032650d0fbc239c6190e167",
|
||||
"sha256:fd634bc17b1e2d6ce716f0e43446d0d61cdadb1efcad5c56ca211c22b246ebc8"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.902"
|
||||
},
|
||||
"mypy-extensions": {
|
||||
"hashes": [
|
||||
"sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
|
||||
"sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
|
||||
],
|
||||
"version": "==0.4.3"
|
||||
},
|
||||
"pycodestyle": {
|
||||
"hashes": [
|
||||
"sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068",
|
||||
"sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.7.0"
|
||||
},
|
||||
"pyflakes": {
|
||||
"hashes": [
|
||||
"sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3",
|
||||
"sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.3.1"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"typed-ast": {
|
||||
"hashes": [
|
||||
"sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
|
||||
"sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
|
||||
"sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
|
||||
"sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
|
||||
"sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
|
||||
"sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
|
||||
"sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
|
||||
"sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
|
||||
"sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
|
||||
"sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
|
||||
"sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
|
||||
"sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
|
||||
"sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
|
||||
"sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
|
||||
"sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
|
||||
"sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
|
||||
"sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
|
||||
"sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
|
||||
"sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
|
||||
"sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
|
||||
"sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
|
||||
"sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
|
||||
"sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
|
||||
"sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
|
||||
"sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
|
||||
"sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
|
||||
"sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
|
||||
"sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
|
||||
"sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
|
||||
"sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==1.4.3"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497",
|
||||
"sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342",
|
||||
"sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.0"
|
||||
},
|
||||
"yapf": {
|
||||
"hashes": [
|
||||
"sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
|
||||
"sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.31.0"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76",
|
||||
"sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.4.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
104
test_runner/README.md
Normal file
@@ -0,0 +1,104 @@
|
||||
## Zenith test runner
|
||||
|
||||
This directory contains integration tests.
|
||||
|
||||
Prerequisites:
|
||||
- Python 3.6 or later
|
||||
- Dependencies: install them via `pipenv install`. Note that Debian/Ubuntu
|
||||
  packages are stale, as commonly happens, so manual installation is not
|
||||
recommended.
|
||||
Run `pipenv shell` to activate the venv.
|
||||
- Zenith and Postgres binaries
|
||||
- See the root README.md for build directions
|
||||
- Tests can be run from the git tree; or see the environment variables
|
||||
below to run from other directories.
|
||||
- The zenith git repo, including the postgres submodule
|
||||
(for some tests, e.g. pg_regress)
|
||||
|
||||
### Test Organization
|
||||
|
||||
The tests are divided into a few batches, such that each batch takes roughly
|
||||
the same amount of time. The batches can be run in parallel, to minimize total
|
||||
runtime. Currently, there are only two batches:
|
||||
|
||||
- test_batch_pg_regress: Runs PostgreSQL regression tests
|
||||
- test_others: All other tests
|
||||
|
||||
### Running the tests
|
||||
|
||||
Because pytest will search all subdirectories for tests, it's easiest to
|
||||
run the tests from within the `test_runner` directory.
|
||||
|
||||
Test state (postgres data, pageserver state, and log files) will
|
||||
be stored under a directory `test_output`.
|
||||
|
||||
You can run all the tests with:
|
||||
|
||||
`pytest`
|
||||
|
||||
If you want to run all the tests in a particular file:
|
||||
|
||||
`pytest test_pgbench.py`
|
||||
|
||||
If you want to run all tests that have the string "bench" in their names:
|
||||
|
||||
`pytest -k bench`
|
||||
|
||||
Useful environment variables:
|
||||
|
||||
`ZENITH_BIN`: The directory where zenith binaries can be found.
|
||||
`POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found.
|
||||
`TEST_OUTPUT`: Set the directory where test state and test output files
|
||||
should go.
|
||||
`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
|
||||
|
||||
Let stdout and stderr go to the terminal instead of capturing them:
|
||||
`pytest -s ...`
|
||||
(Note many tests capture subprocess outputs separately, so this may not
|
||||
show much.)
|
||||
|
||||
Exit after the first test failure:
|
||||
`pytest -x ...`
|
||||
(there are many more pytest options; run `pytest -h` to see them.)
|
||||
|
||||
|
||||
### Building new tests
|
||||
|
||||
The tests make heavy use of pytest fixtures. You can read about how they work here: https://docs.pytest.org/en/stable/fixture.html
|
||||
|
||||
Essentially, this means that each time you see a fixture named as an input parameter, the fixture function with that name will be run and its result passed as a parameter to the test function.
|
||||
|
||||
So this code:
|
||||
|
||||
```python
|
||||
def test_something(zenith_cli, pg_bin):
|
||||
pass
|
||||
```
|
||||
|
||||
... will run the fixtures called `zenith_cli` and `pg_bin` and deliver those results to the test function.
|
||||
|
||||
Fixtures can't be imported using the normal Python syntax. Instead, use this:
|
||||
|
||||
```python
|
||||
pytest_plugins = ("fixtures.something")
|
||||
```
|
||||
|
||||
That will make all the fixtures in the `fixtures/something.py` file available.
|
||||
|
||||
Anything that's likely to be used in multiple tests should be built into a fixture.
|
||||
|
||||
Note that fixtures can clean up after themselves if they use the `yield` syntax.
|
||||
Cleanup will happen even if the test fails (raises an unhandled exception).
|
||||
Python destructors, e.g. `__del__()` aren't recommended for cleanup.
|
||||
|
||||
|
||||
### Code quality
|
||||
|
||||
Before submitting a patch, please consider:
|
||||
|
||||
* Writing a couple of docstrings to clarify the reasoning behind a new test.
|
||||
* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
|
||||
* Formatting the code with `yapf -r -i .` (TODO: implement an opt-in pre-commit hook for that).
|
||||
* (Optional) Typechecking the code with `mypy .`. Currently this mostly affects `fixtures/zenith_fixtures.py`.
|
||||
|
||||
The tools can be installed with `pipenv install --dev`.
|
||||
73
test_runner/batch_others/test_branch_behind.py
Normal file
@@ -0,0 +1,73 @@
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
#
|
||||
# Create a couple of branches off the main branch, at a historical point in time.
|
||||
#
|
||||
def test_branch_behind(zenith_cli, pageserver, postgres, pg_bin):
|
||||
    # Create the 'test_branch_behind' branch off the empty branch
|
||||
zenith_cli.run(["branch", "test_branch_behind", "empty"])
|
||||
|
||||
pgmain = postgres.create_start('test_branch_behind')
|
||||
print("postgres is running on 'test_branch_behind' branch")
|
||||
|
||||
main_pg_conn = pgmain.connect()
|
||||
main_cur = main_pg_conn.cursor()
|
||||
|
||||
# Create table, and insert the first 100 rows
|
||||
main_cur.execute('CREATE TABLE foo (t text)')
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_a = main_cur.fetchone()[0]
|
||||
print('LSN after 100 rows: ' + lsn_a)
|
||||
|
||||
# Insert some more rows. (This generates enough WAL to fill a few segments.)
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_b = main_cur.fetchone()[0]
|
||||
print('LSN after 100100 rows: ' + lsn_b)
|
||||
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
|
||||
|
||||
# Insert many more rows. This generates enough WAL to fill a few segments.
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_c = main_cur.fetchone()[0]
|
||||
print('LSN after 200100 rows: ' + lsn_c)
|
||||
|
||||
    # Branch at the point where 100100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
|
||||
|
||||
pg_hundred = postgres.create_start("test_branch_behind_hundred")
|
||||
pg_more = postgres.create_start("test_branch_behind_more")
|
||||
|
||||
# On the 'hundred' branch, we should see only 100 rows
|
||||
hundred_pg_conn = pg_hundred.connect()
|
||||
hundred_cur = hundred_pg_conn.cursor()
|
||||
hundred_cur.execute('SELECT count(*) FROM foo')
|
||||
assert hundred_cur.fetchone() == (100, )
|
||||
|
||||
    # On the 'more' branch, we should see 100100 rows
|
||||
more_pg_conn = pg_more.connect()
|
||||
more_cur = more_pg_conn.cursor()
|
||||
more_cur.execute('SELECT count(*) FROM foo')
|
||||
assert more_cur.fetchone() == (100100, )
|
||||
|
||||
# All the rows are visible on the main branch
|
||||
main_cur.execute('SELECT count(*) FROM foo')
|
||||
assert main_cur.fetchone() == (200100, )
|
||||
25
test_runner/batch_others/test_bulk_insert.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from contextlib import closing
|
||||
import psycopg2.extras
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
#
|
||||
# Test insertion of a large number of records
|
||||
#
|
||||
# This test is pretty tightly coupled with the current implementation of page version storage
|
||||
# and garbage collection in object_repository.rs.
|
||||
#
|
||||
def test_bulk_insert(zenith_cli, pageserver, postgres, pg_bin):
|
||||
zenith_cli.run(["branch", "test_bulk_insert", "empty"])
|
||||
pg = postgres.create_start('test_bulk_insert')
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("create table t(c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint)")
|
||||
cur.execute("create index on t(c1)")
|
||||
cur.execute("create index on t(c2)")
|
||||
cur.execute("create index on t(c3)")
|
||||
cur.execute("create index on t(c4)")
|
||||
cur.execute("create index on t(c5)")
|
||||
cur.execute("insert into t values (generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000))")
|
||||
cur.execute("insert into t values (generate_series(1,1000000),random()*1000000,random()*1000000,random()*1000000,random()*1000000)")
|
||||
29
test_runner/batch_others/test_config.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from contextlib import closing
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
#
|
||||
# Test starting Postgres with custom options
|
||||
#
|
||||
def test_config(zenith_cli, pageserver, postgres, pg_bin):
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_config", "empty"])
|
||||
|
||||
# change config
|
||||
pg = postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
|
||||
print('postgres is running on test_config branch')
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute('''
|
||||
SELECT setting
|
||||
FROM pg_settings
|
||||
WHERE
|
||||
source != 'default'
|
||||
AND source != 'override'
|
||||
AND name = 'log_min_messages'
|
||||
''')
|
||||
|
||||
# check that config change was applied
|
||||
assert cur.fetchone() == ('debug1', )
|
||||
32
test_runner/batch_others/test_createdb.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from contextlib import closing
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
#
|
||||
# Test CREATE DATABASE when there have been relmapper changes
|
||||
#
|
||||
def test_createdb(zenith_cli, pageserver, postgres, pg_bin):
|
||||
zenith_cli.run(["branch", "test_createdb", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_createdb')
|
||||
print("postgres is running on 'test_createdb' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
# Cause a 'relmapper' change in the original branch
|
||||
cur.execute('VACUUM FULL pg_class')
|
||||
|
||||
cur.execute('CREATE DATABASE foodb')
|
||||
|
||||
cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn = cur.fetchone()[0]
|
||||
|
||||
# Create a branch
|
||||
zenith_cli.run(["branch", "test_createdb2", "test_createdb@" + lsn])
|
||||
|
||||
pg2 = postgres.create_start('test_createdb2')
|
||||
|
||||
# Test that you can connect to the new database on both branches
|
||||
for db in (pg, pg2):
|
||||
db.connect(dbname='foodb').close()
|
||||
31
test_runner/batch_others/test_createuser.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from contextlib import closing
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
#
|
||||
# Test CREATE USER to check shared catalog restore
|
||||
#
|
||||
def test_createuser(zenith_cli, pageserver, postgres, pg_bin):
|
||||
zenith_cli.run(["branch", "test_createuser", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_createuser')
|
||||
print("postgres is running on 'test_createuser' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
            # Create a user on the original branch (this changes the shared catalogs)
|
||||
cur.execute('CREATE USER testuser with password %s', ('testpwd', ))
|
||||
|
||||
cur.execute('CHECKPOINT')
|
||||
|
||||
cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn = cur.fetchone()[0]
|
||||
|
||||
# Create a branch
|
||||
zenith_cli.run(["branch", "test_createuser2", "test_createuser@" + lsn])
|
||||
|
||||
pg2 = postgres.create_start('test_createuser2')
|
||||
|
||||
# Test that you can connect to new branch as a new user
|
||||
assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )]
|
||||
Some files were not shown because too many files have changed in this diff.