mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-14 08:00:38 +00:00
Compare commits
284 Commits
at-063f9ba
...
revert-176
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
78d919ef53 | ||
|
|
65cf1a3221 | ||
|
|
a4aef5d8dc | ||
|
|
ffbb9dd155 | ||
|
|
baf7a81dce | ||
|
|
ee3bcf108d | ||
|
|
0da4046704 | ||
|
|
cbd00d7ed9 | ||
|
|
4c30ae8ba3 | ||
|
|
3da4b3165e | ||
|
|
c1b365fdf7 | ||
|
|
fab104d5f3 | ||
|
|
7dd27ecd20 | ||
|
|
bd2979d02c | ||
|
|
5914aab78a | ||
|
|
4a36d89247 | ||
|
|
432907ff5f | ||
|
|
98da0aa159 | ||
|
|
772c2fb4ff | ||
|
|
b9f84f4a83 | ||
|
|
134eeeb096 | ||
|
|
55ea3f262e | ||
|
|
f03779bf1a | ||
|
|
070c255522 | ||
|
|
9ccbb8d331 | ||
|
|
f2881bbd8a | ||
|
|
a884f4cf6b | ||
|
|
9a0fed0880 | ||
|
|
bea84150b2 | ||
|
|
85b5c0e989 | ||
|
|
e4a70faa08 | ||
|
|
c41549f630 | ||
|
|
c700032dd2 | ||
|
|
33cac863d7 | ||
|
|
51ea9c3053 | ||
|
|
a10cac980f | ||
|
|
081d5dac5e | ||
|
|
cded72a580 | ||
|
|
768c846eeb | ||
|
|
a2561f0a78 | ||
|
|
aa7c601eca | ||
|
|
bf899a57d9 | ||
|
|
07b85e7cfc | ||
|
|
22d997049c | ||
|
|
b683308791 | ||
|
|
51c0f9ab2b | ||
|
|
0030da57a8 | ||
|
|
85884a1599 | ||
|
|
ae20751724 | ||
|
|
5812e26b90 | ||
|
|
ec8861b8cc | ||
|
|
4538f1e1b8 | ||
|
|
b10ae195b7 | ||
|
|
b426775aa0 | ||
|
|
5da4f3a4df | ||
|
|
2bde77fced | ||
|
|
c864091035 | ||
|
|
20361395bb | ||
|
|
b338b5dffe | ||
|
|
5bd879f641 | ||
|
|
e6e883eb12 | ||
|
|
d710dff975 | ||
|
|
6cb14b4200 | ||
|
|
87dfa99734 | ||
|
|
cf59b51519 | ||
|
|
0a7735a656 | ||
|
|
64a602b8f3 | ||
|
|
10e4da3997 | ||
|
|
de37f982db | ||
|
|
d4e155aaa3 | ||
|
|
dd6dca9072 | ||
|
|
ef40e404cf | ||
|
|
11a44eda0e | ||
|
|
30a7598172 | ||
|
|
1ad5658d9c | ||
|
|
954859f6c5 | ||
|
|
4024bfe736 | ||
|
|
2ef0e5c6ed | ||
|
|
52a7e3155e | ||
|
|
ad5eaa6027 | ||
|
|
0f3ec83172 | ||
|
|
c46fe90010 | ||
|
|
bc569dde51 | ||
|
|
02e5083695 | ||
|
|
c4bc604e5f | ||
|
|
b8880bfaab | ||
|
|
e2cf77441d | ||
|
|
b68e3b03ed | ||
|
|
e58c83870f | ||
|
|
b9fd8a36ad | ||
|
|
748c5a577b | ||
|
|
51a0f2683b | ||
|
|
9dfa145c7c | ||
|
|
5642d0b2b8 | ||
|
|
2f83f793bc | ||
|
|
2f9b17b9e5 | ||
|
|
e7cba0b607 | ||
|
|
ff7e9a86c6 | ||
|
|
9ede38b6c4 | ||
|
|
62449d6068 | ||
|
|
baa59512b8 | ||
|
|
87a6c4d051 | ||
|
|
801b749e1d | ||
|
|
5cb501c2b3 | ||
|
|
ad25736f3a | ||
|
|
9a396e1feb | ||
|
|
0323bb5870 | ||
|
|
af0195b604 | ||
|
|
9df8915b03 | ||
|
|
4b1bd32e4a | ||
|
|
68ba6a58a0 | ||
|
|
8f479a712f | ||
|
|
2477d2f9e2 | ||
|
|
992874c916 | ||
|
|
3128e8c75c | ||
|
|
f3f12db2cb | ||
|
|
038ea4c128 | ||
|
|
7e1db8c8a1 | ||
|
|
aa933d3961 | ||
|
|
67b4e38092 | ||
|
|
05f8e6a050 | ||
|
|
76388abeb6 | ||
|
|
2911eb084a | ||
|
|
6cca57f95a | ||
|
|
4a46b01caf | ||
|
|
5c5c3c64f3 | ||
|
|
29539b0561 | ||
|
|
695b5f9d88 | ||
|
|
66694e736a | ||
|
|
091cefaa92 | ||
|
|
aeb4f81c3b | ||
|
|
6391862d8a | ||
|
|
b2e35fffa6 | ||
|
|
8b9d523f3c | ||
|
|
3fd234da07 | ||
|
|
778744d35c | ||
|
|
eabf6f89e4 | ||
|
|
fec050ce97 | ||
|
|
d060a97c54 | ||
|
|
78a6cb247f | ||
|
|
8f6a161271 | ||
|
|
56f6269a8e | ||
|
|
1fb3d08185 | ||
|
|
867aede715 | ||
|
|
d3f356e7a8 | ||
|
|
5f83c9290b | ||
|
|
a4700c9bbe | ||
|
|
dafdf9b952 | ||
|
|
263d60f12d | ||
|
|
abcd7a4b1f | ||
|
|
81cad6277a | ||
|
|
629688fd6c | ||
|
|
9d3779c124 | ||
|
|
334a1d6b5d | ||
|
|
e41ad3be0f | ||
|
|
e113c6fa8d | ||
|
|
cbdfd8c719 | ||
|
|
86bf4301b7 | ||
|
|
9eaa21317c | ||
|
|
e660e12f79 | ||
|
|
ac52f4f2d6 | ||
|
|
5e95338ee9 | ||
|
|
170badd626 | ||
|
|
91fb21225a | ||
|
|
3e6087a12f | ||
|
|
44bfc529f6 | ||
|
|
ef72eb84cf | ||
|
|
a1e34772e5 | ||
|
|
389bd1faeb | ||
|
|
c15aa04714 | ||
|
|
52e0816fa5 | ||
|
|
81417788c8 | ||
|
|
81879f8137 | ||
|
|
5b29774532 | ||
|
|
0ca2bd929b | ||
|
|
9b7dcc2bae | ||
|
|
3136a0754a | ||
|
|
787f0d33f0 | ||
|
|
ed5f9acca9 | ||
|
|
4bc338babc | ||
|
|
3ab090b43a | ||
|
|
7126979950 | ||
|
|
9946cd1125 | ||
|
|
ab20f2c491 | ||
|
|
c9d897f9b6 | ||
|
|
e97f94cc30 | ||
|
|
2cb39a1624 | ||
|
|
93e0ac2b7a | ||
|
|
d5ae9db997 | ||
|
|
9e4de6bed0 | ||
|
|
4a8c663452 | ||
|
|
a009fe912a | ||
|
|
19954dfd8a | ||
|
|
570db6f168 | ||
|
|
cdf04b6a9f | ||
|
|
a0781f229c | ||
|
|
1d36c5a39e | ||
|
|
49da76237b | ||
|
|
1fd08107ca | ||
|
|
58d5136a61 | ||
|
|
87020f8126 | ||
|
|
20414c4b16 | ||
|
|
9b7a8e67a4 | ||
|
|
4af87f3d60 | ||
|
|
0fbe657b2f | ||
|
|
07a9553700 | ||
|
|
dc7e3ff05a | ||
|
|
4f172e7612 | ||
|
|
0e9ee772af | ||
|
|
db63fa64ae | ||
|
|
8e2a6661e9 | ||
|
|
214567bf8f | ||
|
|
c4b57e4b8f | ||
|
|
5d9851f5d1 | ||
|
|
81ba23094e | ||
|
|
d5258cdc4d | ||
|
|
6bc78a0e77 | ||
|
|
6fe443e239 | ||
|
|
d0c246ac3c | ||
|
|
2f784144fe | ||
|
|
222b723354 | ||
|
|
089ba6abfe | ||
|
|
a5a478c321 | ||
|
|
fcf613b6e3 | ||
|
|
572b3f48cf | ||
|
|
bef9b837f1 | ||
|
|
232fe14297 | ||
|
|
92031d376a | ||
|
|
1f0b406b63 | ||
|
|
4c9447589a | ||
|
|
9e5423c867 | ||
|
|
43c16c5145 | ||
|
|
af712798e7 | ||
|
|
f5da652388 | ||
|
|
8745b022a9 | ||
|
|
a40b7cd516 | ||
|
|
1aa8fe43cf | ||
|
|
649f324fe3 | ||
|
|
8609234204 | ||
|
|
5c5629910f | ||
|
|
277e41f4b7 | ||
|
|
ce0243bc12 | ||
|
|
ec3bc74165 | ||
|
|
9594362f74 | ||
|
|
eee0f51e0c | ||
|
|
fd78110c2b | ||
|
|
be6a6958e2 | ||
|
|
0e44887929 | ||
|
|
1aa57fc262 | ||
|
|
9a4f0930c0 | ||
|
|
d88f8b4a7e | ||
|
|
8a901de52a | ||
|
|
a883202495 | ||
|
|
780b46ad27 | ||
|
|
75002adc14 | ||
|
|
07342f7519 | ||
|
|
55de0b88f5 | ||
|
|
d56a0ee19a | ||
|
|
18dfc769d8 | ||
|
|
5e04dad360 | ||
|
|
b8cba059a5 | ||
|
|
e3fa00972e | ||
|
|
b39d1b1717 | ||
|
|
28bc8e3f5c | ||
|
|
6244fd9e7e | ||
|
|
f6b1d76c30 | ||
|
|
edc7bebcb5 | ||
|
|
a201d33edc | ||
|
|
825d363170 | ||
|
|
b9a1a75b0d | ||
|
|
d3a9cb44a6 | ||
|
|
c718870517 | ||
|
|
8437fc056e | ||
|
|
8b8d78a3a0 | ||
|
|
8a86276a6e | ||
|
|
0be7ed0cb5 | ||
|
|
e80ae4306a | ||
|
|
123fcd5d0d | ||
|
|
15434ba7e0 | ||
|
|
a4d0d78e9e | ||
|
|
e13bdd77fe | ||
|
|
bd6bef468c | ||
|
|
77ed2a0fa0 | ||
|
|
37ebbb598d |
4
.circleci/ansible/.gitignore
vendored
Normal file
4
.circleci/ansible/.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
zenith_install.tar.gz
|
||||
.zenith_current_version
|
||||
neon_install.tar.gz
|
||||
.neon_current_version
|
||||
@@ -1,67 +1,29 @@
|
||||
- name: Upload Zenith binaries
|
||||
hosts: pageservers:safekeepers
|
||||
- name: Upload Neon binaries
|
||||
hosts: storage
|
||||
gather_facts: False
|
||||
remote_user: admin
|
||||
vars:
|
||||
force_deploy: false
|
||||
|
||||
tasks:
|
||||
|
||||
- name: get latest version of Zenith binaries
|
||||
ignore_errors: true
|
||||
- name: get latest version of Neon binaries
|
||||
register: current_version_file
|
||||
set_fact:
|
||||
current_version: "{{ lookup('file', '.zenith_current_version') | trim }}"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: set zero value for current_version
|
||||
when: current_version_file is failed
|
||||
set_fact:
|
||||
current_version: "0"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: get deployed version from content of remote file
|
||||
ignore_errors: true
|
||||
ansible.builtin.slurp:
|
||||
src: /usr/local/.zenith_current_version
|
||||
register: remote_version_file
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: decode remote file content
|
||||
when: remote_version_file is succeeded
|
||||
set_fact:
|
||||
remote_version: "{{ remote_version_file['content'] | b64decode | trim }}"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: set zero value for remote_version
|
||||
when: remote_version_file is failed
|
||||
set_fact:
|
||||
remote_version: "0"
|
||||
current_version: "{{ lookup('file', '.neon_current_version') | trim }}"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: inform about versions
|
||||
debug: msg="Version to deploy - {{ current_version }}, version on storage node - {{ remote_version }}"
|
||||
debug: msg="Version to deploy - {{ current_version }}"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
|
||||
- name: upload and extract Zenith binaries to /usr/local
|
||||
when: current_version > remote_version or force_deploy
|
||||
- name: upload and extract Neon binaries to /usr/local
|
||||
ansible.builtin.unarchive:
|
||||
owner: root
|
||||
group: root
|
||||
src: zenith_install.tar.gz
|
||||
src: neon_install.tar.gz
|
||||
dest: /usr/local
|
||||
become: true
|
||||
tags:
|
||||
@@ -74,14 +36,24 @@
|
||||
hosts: pageservers
|
||||
gather_facts: False
|
||||
remote_user: admin
|
||||
vars:
|
||||
force_deploy: false
|
||||
|
||||
tasks:
|
||||
|
||||
- name: upload init script
|
||||
when: console_mgmt_base_url is defined
|
||||
ansible.builtin.template:
|
||||
src: scripts/init_pageserver.sh
|
||||
dest: /tmp/init_pageserver.sh
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
become: true
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: init pageserver
|
||||
when: current_version > remote_version or force_deploy
|
||||
shell:
|
||||
cmd: sudo -u pageserver /usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
|
||||
cmd: /tmp/init_pageserver.sh
|
||||
args:
|
||||
creates: "/storage/pageserver/data/tenants"
|
||||
environment:
|
||||
@@ -91,8 +63,20 @@
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: update remote storage (s3) config
|
||||
lineinfile:
|
||||
path: /storage/pageserver/data/pageserver.toml
|
||||
line: "{{ item }}"
|
||||
loop:
|
||||
- "[remote_storage]"
|
||||
- "bucket_name = '{{ bucket_name }}'"
|
||||
- "bucket_region = '{{ bucket_region }}'"
|
||||
- "prefix_in_bucket = '{{ inventory_hostname }}'"
|
||||
become: true
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: upload systemd service definition
|
||||
when: current_version > remote_version or force_deploy
|
||||
ansible.builtin.template:
|
||||
src: systemd/pageserver.service
|
||||
dest: /etc/systemd/system/pageserver.service
|
||||
@@ -104,7 +88,6 @@
|
||||
- pageserver
|
||||
|
||||
- name: start systemd service
|
||||
when: current_version > remote_version or force_deploy
|
||||
ansible.builtin.systemd:
|
||||
daemon_reload: yes
|
||||
name: pageserver
|
||||
@@ -115,7 +98,7 @@
|
||||
- pageserver
|
||||
|
||||
- name: post version to console
|
||||
when: (current_version > remote_version or force_deploy) and console_mgmt_base_url is defined
|
||||
when: console_mgmt_base_url is defined
|
||||
shell:
|
||||
cmd: |
|
||||
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
||||
@@ -127,22 +110,42 @@
|
||||
hosts: safekeepers
|
||||
gather_facts: False
|
||||
remote_user: admin
|
||||
vars:
|
||||
force_deploy: false
|
||||
|
||||
tasks:
|
||||
|
||||
- name: upload init script
|
||||
when: console_mgmt_base_url is defined
|
||||
ansible.builtin.template:
|
||||
src: scripts/init_safekeeper.sh
|
||||
dest: /tmp/init_safekeeper.sh
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
become: true
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
- name: init safekeeper
|
||||
shell:
|
||||
cmd: /tmp/init_safekeeper.sh
|
||||
args:
|
||||
creates: "/storage/safekeeper/data/safekeeper.id"
|
||||
environment:
|
||||
ZENITH_REPO_DIR: "/storage/safekeeper/data"
|
||||
LD_LIBRARY_PATH: "/usr/local/lib"
|
||||
become: true
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
# in the future safekeepers should discover pageservers byself
|
||||
# but currently use first pageserver that was discovered
|
||||
- name: set first pageserver var for safekeepers
|
||||
when: current_version > remote_version or force_deploy
|
||||
set_fact:
|
||||
first_pageserver: "{{ hostvars[groups['pageservers'][0]]['inventory_hostname'] }}"
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
- name: upload systemd service definition
|
||||
when: current_version > remote_version or force_deploy
|
||||
ansible.builtin.template:
|
||||
src: systemd/safekeeper.service
|
||||
dest: /etc/systemd/system/safekeeper.service
|
||||
@@ -154,7 +157,6 @@
|
||||
- safekeeper
|
||||
|
||||
- name: start systemd service
|
||||
when: current_version > remote_version or force_deploy
|
||||
ansible.builtin.systemd:
|
||||
daemon_reload: yes
|
||||
name: safekeeper
|
||||
@@ -165,7 +167,7 @@
|
||||
- safekeeper
|
||||
|
||||
- name: post version to console
|
||||
when: (current_version > remote_version or force_deploy) and console_mgmt_base_url is defined
|
||||
when: console_mgmt_base_url is defined
|
||||
shell:
|
||||
cmd: |
|
||||
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
||||
|
||||
@@ -4,10 +4,10 @@ set -e
|
||||
|
||||
RELEASE=${RELEASE:-false}
|
||||
|
||||
# look at docker hub for latest tag fo zenith docker image
|
||||
# look at docker hub for latest tag for neon docker image
|
||||
if [ "${RELEASE}" = "true" ]; then
|
||||
echo "search latest relase tag"
|
||||
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1)
|
||||
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1)
|
||||
if [ -z "${VERSION}" ]; then
|
||||
echo "no any docker tags found, exiting..."
|
||||
exit 1
|
||||
@@ -16,7 +16,7 @@ if [ "${RELEASE}" = "true" ]; then
|
||||
fi
|
||||
else
|
||||
echo "search latest dev tag"
|
||||
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep -v release | tail -1)
|
||||
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1)
|
||||
if [ -z "${VERSION}" ]; then
|
||||
echo "no any docker tags found, exiting..."
|
||||
exit 1
|
||||
@@ -28,25 +28,25 @@ fi
|
||||
echo "found ${VERSION}"
|
||||
|
||||
# do initial cleanup
|
||||
rm -rf zenith_install postgres_install.tar.gz zenith_install.tar.gz .zenith_current_version
|
||||
mkdir zenith_install
|
||||
rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version
|
||||
mkdir neon_install
|
||||
|
||||
# retrive binaries from docker image
|
||||
echo "getting binaries from docker image"
|
||||
docker pull --quiet zenithdb/zenith:${TAG}
|
||||
ID=$(docker create zenithdb/zenith:${TAG})
|
||||
docker pull --quiet neondatabase/neon:${TAG}
|
||||
ID=$(docker create neondatabase/neon:${TAG})
|
||||
docker cp ${ID}:/data/postgres_install.tar.gz .
|
||||
tar -xzf postgres_install.tar.gz -C zenith_install
|
||||
docker cp ${ID}:/usr/local/bin/pageserver zenith_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/safekeeper zenith_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/proxy zenith_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/postgres zenith_install/bin/
|
||||
tar -xzf postgres_install.tar.gz -C neon_install
|
||||
docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/postgres neon_install/bin/
|
||||
docker rm -vf ${ID}
|
||||
|
||||
# store version to file (for ansible playbooks) and create binaries tarball
|
||||
echo ${VERSION} > zenith_install/.zenith_current_version
|
||||
echo ${VERSION} > .zenith_current_version
|
||||
tar -czf zenith_install.tar.gz -C zenith_install .
|
||||
echo ${VERSION} > neon_install/.neon_current_version
|
||||
echo ${VERSION} > .neon_current_version
|
||||
tar -czf neon_install.tar.gz -C neon_install .
|
||||
|
||||
# do final cleaup
|
||||
rm -rf zenith_install postgres_install.tar.gz
|
||||
rm -rf neon_install postgres_install.tar.gz
|
||||
|
||||
19
.circleci/ansible/neon-stress.hosts
Normal file
19
.circleci/ansible/neon-stress.hosts
Normal file
@@ -0,0 +1,19 @@
|
||||
[pageservers]
|
||||
neon-stress-ps-1 console_region_id=1
|
||||
neon-stress-ps-2 console_region_id=1
|
||||
|
||||
[safekeepers]
|
||||
neon-stress-sk-1 console_region_id=1
|
||||
neon-stress-sk-2 console_region_id=1
|
||||
neon-stress-sk-3 console_region_id=1
|
||||
|
||||
[storage:children]
|
||||
pageservers
|
||||
safekeepers
|
||||
|
||||
[storage:vars]
|
||||
console_mgmt_base_url = http://neon-stress-console.local
|
||||
bucket_name = neon-storage-ireland
|
||||
bucket_region = eu-west-1
|
||||
etcd_endpoints = etcd-stress.local:2379
|
||||
safekeeper_enable_s3_offload = false
|
||||
@@ -1,7 +1,18 @@
|
||||
[pageservers]
|
||||
zenith-1-ps-1
|
||||
zenith-1-ps-1 console_region_id=1
|
||||
|
||||
[safekeepers]
|
||||
zenith-1-sk-1
|
||||
zenith-1-sk-2
|
||||
zenith-1-sk-3
|
||||
zenith-1-sk-1 console_region_id=1
|
||||
zenith-1-sk-2 console_region_id=1
|
||||
zenith-1-sk-3 console_region_id=1
|
||||
|
||||
[storage:children]
|
||||
pageservers
|
||||
safekeepers
|
||||
|
||||
[storage:vars]
|
||||
console_mgmt_base_url = http://console-release.local
|
||||
bucket_name = zenith-storage-oregon
|
||||
bucket_region = us-west-2
|
||||
etcd_endpoints = etcd-release.local:2379
|
||||
safekeeper_enable_s3_offload = true
|
||||
|
||||
30
.circleci/ansible/scripts/init_pageserver.sh
Normal file
30
.circleci/ansible/scripts/init_pageserver.sh
Normal file
@@ -0,0 +1,30 @@
|
||||
#!/bin/sh
|
||||
|
||||
# get instance id from meta-data service
|
||||
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
||||
|
||||
# store fqdn hostname in var
|
||||
HOST=$(hostname -f)
|
||||
|
||||
|
||||
cat <<EOF | tee /tmp/payload
|
||||
{
|
||||
"version": 1,
|
||||
"host": "${HOST}",
|
||||
"port": 6400,
|
||||
"region_id": {{ console_region_id }},
|
||||
"instance_id": "${INSTANCE_ID}",
|
||||
"http_host": "${HOST}",
|
||||
"http_port": 9898
|
||||
}
|
||||
EOF
|
||||
|
||||
# check if pageserver already registered or not
|
||||
if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/pageservers/${INSTANCE_ID} -o /dev/null; then
|
||||
|
||||
# not registered, so register it now
|
||||
ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/pageservers -d@/tmp/payload | jq -r '.ID')
|
||||
|
||||
# init pageserver
|
||||
sudo -u pageserver /usr/local/bin/pageserver -c "id=${ID}" -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
|
||||
fi
|
||||
30
.circleci/ansible/scripts/init_safekeeper.sh
Normal file
30
.circleci/ansible/scripts/init_safekeeper.sh
Normal file
@@ -0,0 +1,30 @@
|
||||
#!/bin/sh
|
||||
|
||||
# get instance id from meta-data service
|
||||
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
||||
|
||||
# store fqdn hostname in var
|
||||
HOST=$(hostname -f)
|
||||
|
||||
|
||||
cat <<EOF | tee /tmp/payload
|
||||
{
|
||||
"version": 1,
|
||||
"host": "${HOST}",
|
||||
"port": 6500,
|
||||
"region_id": {{ console_region_id }},
|
||||
"instance_id": "${INSTANCE_ID}",
|
||||
"http_host": "${HOST}",
|
||||
"http_port": 7676
|
||||
}
|
||||
EOF
|
||||
|
||||
# check if safekeeper already registered or not
|
||||
if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/safekeepers/${INSTANCE_ID} -o /dev/null; then
|
||||
|
||||
# not registered, so register it now
|
||||
ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers -d@/tmp/payload | jq -r '.ID')
|
||||
|
||||
# init safekeeper
|
||||
sudo -u safekeeper /usr/local/bin/safekeeper --id ${ID} --init -D /storage/safekeeper/data
|
||||
fi
|
||||
@@ -1,7 +1,19 @@
|
||||
[pageservers]
|
||||
zenith-us-stage-ps-1
|
||||
#zenith-us-stage-ps-1 console_region_id=27
|
||||
zenith-us-stage-ps-2 console_region_id=27
|
||||
|
||||
[safekeepers]
|
||||
zenith-us-stage-sk-1
|
||||
zenith-us-stage-sk-2
|
||||
zenith-us-stage-sk-3
|
||||
zenith-us-stage-sk-1 console_region_id=27
|
||||
zenith-us-stage-sk-4 console_region_id=27
|
||||
zenith-us-stage-sk-5 console_region_id=27
|
||||
|
||||
[storage:children]
|
||||
pageservers
|
||||
safekeepers
|
||||
|
||||
[storage:vars]
|
||||
console_mgmt_base_url = http://console-staging.local
|
||||
bucket_name = zenith-staging-storage-us-east-1
|
||||
bucket_region = us-east-1
|
||||
etcd_endpoints = etcd-staging.local:2379
|
||||
safekeeper_enable_s3_offload = false
|
||||
|
||||
@@ -6,7 +6,7 @@ After=network.target auditd.service
|
||||
Type=simple
|
||||
User=pageserver
|
||||
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
|
||||
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /storage/pageserver/data
|
||||
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
KillSignal=SIGINT
|
||||
|
||||
@@ -6,7 +6,7 @@ After=network.target auditd.service
|
||||
Type=simple
|
||||
User=safekeeper
|
||||
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --enable-s3-offload={{ safekeeper_enable_s3_offload }}
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
KillSignal=SIGINT
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
version: 2.1
|
||||
|
||||
executors:
|
||||
zenith-xlarge-executor:
|
||||
neon-xlarge-executor:
|
||||
resource_class: xlarge
|
||||
docker:
|
||||
# NB: when changed, do not forget to update rust image tag in all Dockerfiles
|
||||
- image: zimg/rust:1.56
|
||||
zenith-executor:
|
||||
- image: zimg/rust:1.58
|
||||
neon-executor:
|
||||
docker:
|
||||
- image: zimg/rust:1.56
|
||||
- image: zimg/rust:1.58
|
||||
|
||||
jobs:
|
||||
check-codestyle-rust:
|
||||
executor: zenith-xlarge-executor
|
||||
executor: neon-xlarge-executor
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
@@ -22,7 +22,7 @@ jobs:
|
||||
|
||||
# A job to build postgres
|
||||
build-postgres:
|
||||
executor: zenith-xlarge-executor
|
||||
executor: neon-xlarge-executor
|
||||
parameters:
|
||||
build_type:
|
||||
type: enum
|
||||
@@ -34,10 +34,13 @@ jobs:
|
||||
- checkout
|
||||
|
||||
# Grab the postgres git revision to build a cache key.
|
||||
# Append makefile as it could change the way postgres is built.
|
||||
# Note this works even though the submodule hasn't been checkout out yet.
|
||||
- run:
|
||||
name: Get postgres cache key
|
||||
command: git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
|
||||
command: |
|
||||
git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
|
||||
cat Makefile >> /tmp/cache-key-postgres
|
||||
|
||||
- restore_cache:
|
||||
name: Restore postgres cache
|
||||
@@ -64,9 +67,9 @@ jobs:
|
||||
paths:
|
||||
- tmp_install
|
||||
|
||||
# A job to build zenith rust code
|
||||
build-zenith:
|
||||
executor: zenith-xlarge-executor
|
||||
# A job to build Neon rust code
|
||||
build-neon:
|
||||
executor: neon-xlarge-executor
|
||||
parameters:
|
||||
build_type:
|
||||
type: enum
|
||||
@@ -78,11 +81,14 @@ jobs:
|
||||
- checkout
|
||||
|
||||
# Grab the postgres git revision to build a cache key.
|
||||
# Append makefile as it could change the way postgres is built.
|
||||
# Note this works even though the submodule hasn't been checkout out yet.
|
||||
- run:
|
||||
name: Get postgres cache key
|
||||
command: |
|
||||
git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
|
||||
cat Makefile >> /tmp/cache-key-postgres
|
||||
|
||||
|
||||
- restore_cache:
|
||||
name: Restore postgres cache
|
||||
@@ -107,11 +113,16 @@ jobs:
|
||||
CARGO_FLAGS=
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
CARGO_FLAGS=--release
|
||||
CARGO_FLAGS="--release --features profiling"
|
||||
fi
|
||||
|
||||
export CARGO_INCREMENTAL=0
|
||||
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests
|
||||
export CACHEPOT_BUCKET=zenith-rust-cachepot
|
||||
export RUSTC_WRAPPER=cachepot
|
||||
export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
|
||||
export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
|
||||
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
|
||||
cachepot -s
|
||||
|
||||
- save_cache:
|
||||
name: Save rust cache
|
||||
@@ -121,31 +132,19 @@ jobs:
|
||||
- ~/.cargo/git
|
||||
- target
|
||||
|
||||
# Run style checks
|
||||
# has to run separately from cargo fmt section
|
||||
# since needs to run with dependencies
|
||||
- run:
|
||||
name: cargo clippy
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
"${cov_prefix[@]}" ./run_clippy.sh
|
||||
|
||||
# Run rust unit tests
|
||||
- run:
|
||||
name: cargo test
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
CARGO_FLAGS=
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
CARGO_FLAGS=--release
|
||||
fi
|
||||
|
||||
"${cov_prefix[@]}" cargo test
|
||||
"${cov_prefix[@]}" cargo test $CARGO_FLAGS
|
||||
|
||||
# Install the rust binaries, for use by test jobs
|
||||
- run:
|
||||
@@ -210,19 +209,25 @@ jobs:
|
||||
- "*"
|
||||
|
||||
check-codestyle-python:
|
||||
executor: zenith-executor
|
||||
executor: neon-executor
|
||||
steps:
|
||||
- checkout
|
||||
- restore_cache:
|
||||
keys:
|
||||
- v1-python-deps-{{ checksum "poetry.lock" }}
|
||||
- v2-python-deps-{{ checksum "poetry.lock" }}
|
||||
- run:
|
||||
name: Install deps
|
||||
command: ./scripts/pysync
|
||||
- save_cache:
|
||||
key: v1-python-deps-{{ checksum "poetry.lock" }}
|
||||
key: v2-python-deps-{{ checksum "poetry.lock" }}
|
||||
paths:
|
||||
- /home/circleci/.cache/pypoetry/virtualenvs
|
||||
- run:
|
||||
name: Print versions
|
||||
when: always
|
||||
command: |
|
||||
poetry run python --version
|
||||
poetry show
|
||||
- run:
|
||||
name: Run yapf to ensure code format
|
||||
when: always
|
||||
@@ -233,7 +238,7 @@ jobs:
|
||||
command: poetry run mypy .
|
||||
|
||||
run-pytest:
|
||||
executor: zenith-executor
|
||||
executor: neon-executor
|
||||
parameters:
|
||||
# pytest args to specify the tests to run.
|
||||
#
|
||||
@@ -274,12 +279,12 @@ jobs:
|
||||
- run: git submodule update --init --depth 1
|
||||
- restore_cache:
|
||||
keys:
|
||||
- v1-python-deps-{{ checksum "poetry.lock" }}
|
||||
- v2-python-deps-{{ checksum "poetry.lock" }}
|
||||
- run:
|
||||
name: Install deps
|
||||
command: ./scripts/pysync
|
||||
- save_cache:
|
||||
key: v1-python-deps-{{ checksum "poetry.lock" }}
|
||||
key: v2-python-deps-{{ checksum "poetry.lock" }}
|
||||
paths:
|
||||
- /home/circleci/.cache/pypoetry/virtualenvs
|
||||
- run:
|
||||
@@ -356,7 +361,7 @@ jobs:
|
||||
when: always
|
||||
command: |
|
||||
du -sh /tmp/test_output/*
|
||||
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
|
||||
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "etcd.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
|
||||
du -sh /tmp/test_output/*
|
||||
- store_artifacts:
|
||||
path: /tmp/test_output
|
||||
@@ -377,7 +382,7 @@ jobs:
|
||||
- "*"
|
||||
|
||||
coverage-report:
|
||||
executor: zenith-xlarge-executor
|
||||
executor: neon-xlarge-executor
|
||||
steps:
|
||||
- attach_workspace:
|
||||
at: /tmp/zenith
|
||||
@@ -392,7 +397,7 @@ jobs:
|
||||
- run:
|
||||
name: Build coverage report
|
||||
command: |
|
||||
COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
|
||||
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/coverage \
|
||||
--dir=/tmp/zenith/coverage report \
|
||||
@@ -403,11 +408,11 @@ jobs:
|
||||
name: Upload coverage report
|
||||
command: |
|
||||
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
|
||||
REPORT_URL=https://zenithdb.github.io/zenith-coverage-data/$CIRCLE_SHA1
|
||||
COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
|
||||
REPORT_URL=https://neondatabase.github.io/zenith-coverage-data/$CIRCLE_SHA1
|
||||
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/git-upload \
|
||||
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-coverage-data.git \
|
||||
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/neondatabase/zenith-coverage-data.git \
|
||||
--message="Add code coverage for $COMMIT_URL" \
|
||||
copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
|
||||
|
||||
@@ -424,7 +429,7 @@ jobs:
|
||||
\"target_url\": \"$REPORT_URL\"
|
||||
}"
|
||||
|
||||
# Build zenithdb/zenith:latest image and push it to Docker hub
|
||||
# Build neondatabase/neon:latest image and push it to Docker hub
|
||||
docker-image:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -438,18 +443,18 @@ jobs:
|
||||
- run:
|
||||
name: Build and push Docker image
|
||||
command: |
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
docker build \
|
||||
--pull \
|
||||
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:latest .
|
||||
docker push zenithdb/zenith:${DOCKER_TAG}
|
||||
docker push zenithdb/zenith:latest
|
||||
--tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:latest .
|
||||
docker push neondatabase/neon:${DOCKER_TAG}
|
||||
docker push neondatabase/neon:latest
|
||||
|
||||
# Build zenithdb/compute-node:latest image and push it to Docker hub
|
||||
# Build neondatabase/compute-node:latest image and push it to Docker hub
|
||||
docker-image-compute:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -457,28 +462,31 @@ jobs:
|
||||
- checkout
|
||||
- setup_remote_docker:
|
||||
docker_layer_caching: true
|
||||
# Build zenithdb/compute-tools:latest image and push it to Docker hub
|
||||
# Build neondatabase/compute-tools:latest image and push it to Docker hub
|
||||
# TODO: this should probably also use versioned tag, not just :latest.
|
||||
# XXX: but should it? We build and use it only locally now.
|
||||
- run:
|
||||
name: Build and push compute-tools Docker image
|
||||
command: |
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
docker build -t zenithdb/compute-tools:latest -f Dockerfile.compute-tools .
|
||||
docker push zenithdb/compute-tools:latest
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
docker build \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/compute-tools:latest -f Dockerfile.compute-tools .
|
||||
docker push neondatabase/compute-tools:latest
|
||||
- run:
|
||||
name: Init postgres submodule
|
||||
command: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Build and push compute-node Docker image
|
||||
command: |
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:latest vendor/postgres
|
||||
docker push zenithdb/compute-node:${DOCKER_TAG}
|
||||
docker push zenithdb/compute-node:latest
|
||||
docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:latest vendor/postgres
|
||||
docker push neondatabase/compute-node:${DOCKER_TAG}
|
||||
docker push neondatabase/compute-node:latest
|
||||
|
||||
# Build production zenithdb/zenith:release image and push it to Docker hub
|
||||
# Build production neondatabase/neon:release image and push it to Docker hub
|
||||
docker-image-release:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -492,18 +500,18 @@ jobs:
|
||||
- run:
|
||||
name: Build and push Docker image
|
||||
command: |
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG="release-$(git log --oneline|wc -l)"
|
||||
docker build \
|
||||
--pull \
|
||||
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:release .
|
||||
docker push zenithdb/zenith:${DOCKER_TAG}
|
||||
docker push zenithdb/zenith:release
|
||||
--tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:release .
|
||||
docker push neondatabase/neon:${DOCKER_TAG}
|
||||
docker push neondatabase/neon:release
|
||||
|
||||
# Build production zenithdb/compute-node:release image and push it to Docker hub
|
||||
# Build production neondatabase/compute-node:release image and push it to Docker hub
|
||||
docker-image-compute-release:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -511,26 +519,29 @@ jobs:
|
||||
- checkout
|
||||
- setup_remote_docker:
|
||||
docker_layer_caching: true
|
||||
# Build zenithdb/compute-tools:release image and push it to Docker hub
|
||||
# Build neondatabase/compute-tools:release image and push it to Docker hub
|
||||
# TODO: this should probably also use versioned tag, not just :latest.
|
||||
# XXX: but should it? We build and use it only locally now.
|
||||
- run:
|
||||
name: Build and push compute-tools Docker image
|
||||
command: |
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
docker build -t zenithdb/compute-tools:release -f Dockerfile.compute-tools .
|
||||
docker push zenithdb/compute-tools:release
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
docker build \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/compute-tools:release -f Dockerfile.compute-tools .
|
||||
docker push neondatabase/compute-tools:release
|
||||
- run:
|
||||
name: Init postgres submodule
|
||||
command: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Build and push compute-node Docker image
|
||||
command: |
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG="release-$(git log --oneline|wc -l)"
|
||||
docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:release vendor/postgres
|
||||
docker push zenithdb/compute-node:${DOCKER_TAG}
|
||||
docker push zenithdb/compute-node:release
|
||||
docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:release vendor/postgres
|
||||
docker push neondatabase/compute-node:${DOCKER_TAG}
|
||||
docker push neondatabase/compute-node:release
|
||||
|
||||
deploy-staging:
|
||||
docker:
|
||||
@@ -556,7 +567,7 @@ jobs:
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i staging.hosts
|
||||
rm -f zenith_install.tar.gz .zenith_current_version
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-staging-proxy:
|
||||
docker:
|
||||
@@ -574,13 +585,62 @@ jobs:
|
||||
name: Setup helm v3
|
||||
command: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add zenithdb https://zenithdb.github.io/helm-charts
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
- run:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
|
||||
deploy-neon-stress:
|
||||
docker:
|
||||
- image: cimg/python:3.10
|
||||
steps:
|
||||
- checkout
|
||||
- setup_remote_docker
|
||||
- run:
|
||||
name: Setup ansible
|
||||
command: |
|
||||
pip install --progress-bar off --user ansible boto3
|
||||
- run:
|
||||
name: Redeploy
|
||||
command: |
|
||||
cd "$(pwd)/.circleci/ansible"
|
||||
|
||||
./get_binaries.sh
|
||||
|
||||
echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key
|
||||
echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
|
||||
chmod 0600 ssh-key
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i neon-stress.hosts
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-neon-stress-proxy:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
environment:
|
||||
KUBECONFIG: .kubeconfig
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Store kubeconfig file
|
||||
command: |
|
||||
echo "${NEON_STRESS_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
|
||||
chmod 0600 ${KUBECONFIG}
|
||||
- run:
|
||||
name: Setup helm v3
|
||||
command: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
- run:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
helm upgrade neon-stress-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
|
||||
deploy-release:
|
||||
docker:
|
||||
@@ -605,8 +665,8 @@ jobs:
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i production.hosts -e console_mgmt_base_url=http://console-release.local
|
||||
rm -f zenith_install.tar.gz .zenith_current_version
|
||||
ansible-playbook deploy.yaml -i production.hosts
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-release-proxy:
|
||||
docker:
|
||||
@@ -624,7 +684,7 @@ jobs:
|
||||
name: Setup helm v3
|
||||
command: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add zenithdb https://zenithdb.github.io/helm-charts
|
||||
helm repo add zenithdb https://neondatabase.github.io/helm-charts
|
||||
- run:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
@@ -653,7 +713,7 @@ jobs:
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"pending\",
|
||||
\"context\": \"zenith-remote-ci\",
|
||||
\"context\": \"neon-cloud-e2e\",
|
||||
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
||||
}"
|
||||
- run:
|
||||
@@ -669,7 +729,7 @@ jobs:
|
||||
"{
|
||||
\"ref\": \"main\",
|
||||
\"inputs\": {
|
||||
\"ci_job_name\": \"zenith-remote-ci\",
|
||||
\"ci_job_name\": \"neon-cloud-e2e\",
|
||||
\"commit_hash\": \"$CIRCLE_SHA1\",
|
||||
\"remote_repo\": \"$LOCAL_REPO\"
|
||||
}
|
||||
@@ -685,8 +745,8 @@ workflows:
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
- build-zenith:
|
||||
name: build-zenith-<< matrix.build_type >>
|
||||
- build-neon:
|
||||
name: build-neon-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
@@ -701,7 +761,7 @@ workflows:
|
||||
test_selection: batch_pg_regress
|
||||
needs_postgres_source: true
|
||||
requires:
|
||||
- build-zenith-<< matrix.build_type >>
|
||||
- build-neon-<< matrix.build_type >>
|
||||
- run-pytest:
|
||||
name: other-tests-<< matrix.build_type >>
|
||||
matrix:
|
||||
@@ -709,7 +769,7 @@ workflows:
|
||||
build_type: ["debug", "release"]
|
||||
test_selection: batch_others
|
||||
requires:
|
||||
- build-zenith-<< matrix.build_type >>
|
||||
- build-neon-<< matrix.build_type >>
|
||||
- run-pytest:
|
||||
name: benchmarks
|
||||
context: PERF_TEST_RESULT_CONNSTR
|
||||
@@ -718,7 +778,7 @@ workflows:
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
requires:
|
||||
- build-zenith-release
|
||||
- build-neon-release
|
||||
- coverage-report:
|
||||
# Context passes credentials for gh api
|
||||
context: CI_ACCESS_TOKEN
|
||||
@@ -766,6 +826,25 @@ workflows:
|
||||
requires:
|
||||
- docker-image
|
||||
|
||||
- deploy-neon-stress:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
# deploy only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
requires:
|
||||
- docker-image
|
||||
- deploy-neon-stress-proxy:
|
||||
# deploy only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
requires:
|
||||
- docker-image
|
||||
|
||||
- docker-image-release:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
@@ -809,11 +888,11 @@ workflows:
|
||||
- remote-ci-trigger:
|
||||
# Context passes credentials for gh api
|
||||
context: CI_ACCESS_TOKEN
|
||||
remote_repo: "zenithdb/console"
|
||||
remote_repo: "neondatabase/cloud"
|
||||
requires:
|
||||
# XXX: Successful build doesn't mean everything is OK, but
|
||||
# the job to be triggered takes so much time to complete (~22 min)
|
||||
# that it's better not to wait for the commented-out steps
|
||||
- build-zenith-debug
|
||||
- build-neon-release
|
||||
# - pg_regress-tests-release
|
||||
# - other-tests-release
|
||||
|
||||
34
.circleci/helm-values/neon-stress.proxy.yaml
Normal file
34
.circleci/helm-values/neon-stress.proxy.yaml
Normal file
@@ -0,0 +1,34 @@
|
||||
fullnameOverride: "neon-stress-proxy"
|
||||
|
||||
settings:
|
||||
authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.dev.neon.tech/psql_session/"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy
|
||||
zenith_env: staging
|
||||
zenith_region: eu-west-1
|
||||
zenith_region_slug: ireland
|
||||
|
||||
service:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internal
|
||||
external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local
|
||||
type: LoadBalancer
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
@@ -1,9 +1,12 @@
|
||||
# Helm chart values for zenith-proxy.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authEndpoint: "https://console.zenith.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.zenith.tech/psql_session/"
|
||||
authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.neon.tech/psql_session/"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
@@ -25,7 +28,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: start.zenith.tech
|
||||
external-dns.alpha.kubernetes.io/hostname: start.zenith.tech,connect.neon.tech,pg.neon.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
|
||||
31
.circleci/helm-values/staging.proxy-scram.yaml
Normal file
31
.circleci/helm-values/staging.proxy-scram.yaml
Normal file
@@ -0,0 +1,31 @@
|
||||
# Helm chart values for zenith-proxy.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
domain: "*.cloud.stage.neon.tech"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: staging
|
||||
zenith_region: us-east-1
|
||||
zenith_region_slug: virginia
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
@@ -1,9 +1,12 @@
|
||||
# Helm chart values for zenith-proxy.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authEndpoint: "https://console.stage.zenith.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.stage.zenith.tech/psql_session/"
|
||||
authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.stage.neon.tech/psql_session/"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
@@ -17,7 +20,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: start.stage.zenith.tech
|
||||
external-dns.alpha.kubernetes.io/hostname: connect.stage.neon.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
|
||||
26
.config/hakari.toml
Normal file
26
.config/hakari.toml
Normal file
@@ -0,0 +1,26 @@
|
||||
# This file contains settings for `cargo hakari`.
|
||||
# See https://docs.rs/cargo-hakari/latest/cargo_hakari/config for a full list of options.
|
||||
|
||||
hakari-package = "workspace_hack"
|
||||
|
||||
# Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above.
|
||||
dep-format-version = "2"
|
||||
|
||||
# Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended.
|
||||
# Hakari works much better with the new feature resolver.
|
||||
# For more about the new feature resolver, see:
|
||||
# https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#cargos-new-feature-resolver
|
||||
# Have to keep the resolver still here since hakari requires this field,
|
||||
# despite it's now the default for 2021 edition & cargo.
|
||||
resolver = "2"
|
||||
|
||||
# Add triples corresponding to platforms commonly used by developers here.
|
||||
# https://doc.rust-lang.org/rustc/platform-support.html
|
||||
platforms = [
|
||||
# "x86_64-unknown-linux-gnu",
|
||||
# "x86_64-apple-darwin",
|
||||
# "x86_64-pc-windows-msvc",
|
||||
]
|
||||
|
||||
# Write out exact versions rather than a semver range. (Defaults to false.)
|
||||
# exact-versions = true
|
||||
13
.github/workflows/benchmarking.yml
vendored
13
.github/workflows/benchmarking.yml
vendored
@@ -26,7 +26,7 @@ jobs:
|
||||
runs-on: [self-hosted, zenith-benchmarker]
|
||||
|
||||
env:
|
||||
PG_BIN: "/usr/pgsql-13/bin"
|
||||
POSTGRES_DISTRIB_DIR: "/usr/pgsql-13"
|
||||
|
||||
steps:
|
||||
- name: Checkout zenith repo
|
||||
@@ -51,7 +51,7 @@ jobs:
|
||||
echo Poetry
|
||||
poetry --version
|
||||
echo Pgbench
|
||||
$PG_BIN/pgbench --version
|
||||
$POSTGRES_DISTRIB_DIR/bin/pgbench --version
|
||||
|
||||
# FIXME cluster setup is skipped due to various changes in console API
|
||||
# for now pre created cluster is used. When API gain some stability
|
||||
@@ -66,7 +66,7 @@ jobs:
|
||||
|
||||
echo "Starting cluster"
|
||||
# wake up the cluster
|
||||
$PG_BIN/psql $BENCHMARK_CONNSTR -c "SELECT 1"
|
||||
$POSTGRES_DISTRIB_DIR/bin/psql $BENCHMARK_CONNSTR -c "SELECT 1"
|
||||
|
||||
- name: Run benchmark
|
||||
# pgbench is installed system wide from official repo
|
||||
@@ -83,8 +83,11 @@ jobs:
|
||||
# sudo yum install postgresql13-contrib
|
||||
# actual binaries are located in /usr/pgsql-13/bin/
|
||||
env:
|
||||
TEST_PG_BENCH_TRANSACTIONS_MATRIX: "5000,10000,20000"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,15"
|
||||
# The pgbench test runs two tests of given duration against each scale.
|
||||
# So the total runtime with these parameters is 2 * 2 * 300 = 1200, or 20 minutes.
|
||||
# Plus time needed to initialize the test databases.
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
|
||||
PLATFORM: "zenith-staging"
|
||||
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
|
||||
REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
|
||||
|
||||
27
.github/workflows/testing.yml
vendored
27
.github/workflows/testing.yml
vendored
@@ -1,10 +1,8 @@
|
||||
name: Build and Test
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
push:
|
||||
|
||||
jobs:
|
||||
regression-check:
|
||||
@@ -13,7 +11,7 @@ jobs:
|
||||
# If we want to duplicate this job for different
|
||||
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
|
||||
rust_toolchain: [stable]
|
||||
os: [ubuntu-latest]
|
||||
os: [ubuntu-latest, macos-latest]
|
||||
timeout-minutes: 30
|
||||
name: run regression test suite
|
||||
runs-on: ${{ matrix.os }}
|
||||
@@ -32,11 +30,16 @@ jobs:
|
||||
toolchain: ${{ matrix.rust_toolchain }}
|
||||
override: true
|
||||
|
||||
- name: Install postgres dependencies
|
||||
- name: Install Ubuntu postgres dependencies
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
|
||||
|
||||
- name: Install macOs postgres dependencies
|
||||
if: matrix.os == 'macos-latest'
|
||||
run: brew install flex bison
|
||||
|
||||
- name: Set pg revision for caching
|
||||
id: pg_ver
|
||||
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
|
||||
@@ -51,8 +54,7 @@ jobs:
|
||||
|
||||
- name: Build postgres
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
make postgres
|
||||
run: make postgres
|
||||
|
||||
- name: Cache cargo deps
|
||||
id: cache_cargo
|
||||
@@ -62,13 +64,10 @@ jobs:
|
||||
~/.cargo/registry
|
||||
~/.cargo/git
|
||||
target
|
||||
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
|
||||
key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}
|
||||
|
||||
# Use `env CARGO_INCREMENTAL=0` to mitigate https://github.com/rust-lang/rust/issues/91696 for rustc 1.57.0
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
env CARGO_INCREMENTAL=0 cargo build --workspace --bins --examples --tests
|
||||
- name: Run cargo clippy
|
||||
run: ./run_clippy.sh
|
||||
|
||||
- name: Run cargo test
|
||||
run: |
|
||||
env CARGO_INCREMENTAL=0 cargo test -- --nocapture --test-threads=1
|
||||
run: cargo test --all --all-targets
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -11,3 +11,6 @@ test_output/
|
||||
# Coverage
|
||||
*.profraw
|
||||
*.profdata
|
||||
|
||||
*.key
|
||||
*.crt
|
||||
|
||||
1835
Cargo.lock
generated
1835
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
14
Cargo.toml
14
Cargo.toml
@@ -3,13 +3,11 @@ members = [
|
||||
"compute_tools",
|
||||
"control_plane",
|
||||
"pageserver",
|
||||
"postgres_ffi",
|
||||
"proxy",
|
||||
"walkeeper",
|
||||
"safekeeper",
|
||||
"workspace_hack",
|
||||
"zenith",
|
||||
"zenith_metrics",
|
||||
"zenith_utils",
|
||||
"neon_local",
|
||||
"libs/*",
|
||||
]
|
||||
|
||||
[profile.release]
|
||||
@@ -17,7 +15,7 @@ members = [
|
||||
# Besides, debug info should not affect the performance.
|
||||
debug = true
|
||||
|
||||
# This is only needed for proxy's tests
|
||||
# TODO: we should probably fork tokio-postgres-rustls instead
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
[patch.crates-io]
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
|
||||
18
Dockerfile
18
Dockerfile
@@ -1,7 +1,5 @@
|
||||
# Build Postgres
|
||||
#
|
||||
#FROM zimg/rust:1.56 AS pg-build
|
||||
FROM zenithdb/build:buster-20220309 AS pg-build
|
||||
FROM zimg/rust:1.58 AS pg-build
|
||||
WORKDIR /pg
|
||||
|
||||
USER root
|
||||
@@ -11,26 +9,26 @@ COPY Makefile Makefile
|
||||
|
||||
ENV BUILD_TYPE release
|
||||
RUN set -e \
|
||||
&& make -j $(nproc) -s postgres \
|
||||
&& mold -run make -j $(nproc) -s postgres \
|
||||
&& rm -rf tmp_install/build \
|
||||
&& tar -C tmp_install -czf /postgres_install.tar.gz .
|
||||
|
||||
# Build zenith binaries
|
||||
#
|
||||
#FROM zimg/rust:1.56 AS build
|
||||
FROM zenithdb/build:buster-20220309 AS build
|
||||
FROM zimg/rust:1.58 AS build
|
||||
ARG GIT_VERSION=local
|
||||
|
||||
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
|
||||
ARG AWS_ACCESS_KEY_ID
|
||||
ARG AWS_SECRET_ACCESS_KEY
|
||||
#ENV RUSTC_WRAPPER cachepot
|
||||
ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot
|
||||
|
||||
COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
|
||||
COPY . .
|
||||
|
||||
RUN cargo build --release
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, loosing the compilation stats.
|
||||
RUN set -e \
|
||||
&& sudo -E "PATH=$PATH" mold -run cargo build --release \
|
||||
&& cachepot -s
|
||||
|
||||
# Build final image
|
||||
#
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
FROM rust:1.56.1-slim-buster
|
||||
WORKDIR /home/circleci/project
|
||||
|
||||
RUN set -e \
|
||||
&& apt-get update \
|
||||
&& apt-get -yq install \
|
||||
automake \
|
||||
libtool \
|
||||
build-essential \
|
||||
bison \
|
||||
flex \
|
||||
libreadline-dev \
|
||||
zlib1g-dev \
|
||||
libxml2-dev \
|
||||
libseccomp-dev \
|
||||
pkg-config \
|
||||
libssl-dev \
|
||||
clang
|
||||
|
||||
RUN set -e \
|
||||
&& rustup component add clippy \
|
||||
&& cargo install cargo-audit \
|
||||
&& cargo install --git https://github.com/paritytech/cachepot
|
||||
@@ -1,14 +1,18 @@
|
||||
# First transient image to build compute_tools binaries
|
||||
# NB: keep in sync with rust image version in .circle/config.yml
|
||||
FROM rust:1.56.1-slim-buster AS rust-build
|
||||
FROM zimg/rust:1.58 AS rust-build
|
||||
|
||||
WORKDIR /zenith
|
||||
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
|
||||
ARG AWS_ACCESS_KEY_ID
|
||||
ARG AWS_SECRET_ACCESS_KEY
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN cargo build -p compute_tools --release
|
||||
RUN set -e \
|
||||
&& sudo -E "PATH=$PATH" mold -run cargo build -p compute_tools --release \
|
||||
&& cachepot -s
|
||||
|
||||
# Final image that only has one binary
|
||||
FROM debian:buster-slim
|
||||
|
||||
COPY --from=rust-build /zenith/target/release/zenith_ctl /usr/local/bin/zenith_ctl
|
||||
COPY --from=rust-build /home/circleci/project/target/release/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
9
Makefile
9
Makefile
@@ -12,12 +12,12 @@ endif
|
||||
#
|
||||
BUILD_TYPE ?= debug
|
||||
ifeq ($(BUILD_TYPE),release)
|
||||
PG_CONFIGURE_OPTS = --enable-debug
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl
|
||||
PG_CFLAGS = -O2 -g3 $(CFLAGS)
|
||||
# Unfortunately, `--profile=...` is a nightly feature
|
||||
CARGO_BUILD_FLAGS += --release
|
||||
else ifeq ($(BUILD_TYPE),debug)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
|
||||
PG_CFLAGS = -O0 -g3 $(CFLAGS)
|
||||
else
|
||||
$(error Bad build type `$(BUILD_TYPE)', see Makefile for options)
|
||||
@@ -78,6 +78,11 @@ postgres: postgres-configure \
|
||||
$(MAKE) -C tmp_install/build/contrib/zenith install
|
||||
+@echo "Compiling contrib/zenith_test_utils"
|
||||
$(MAKE) -C tmp_install/build/contrib/zenith_test_utils install
|
||||
+@echo "Compiling pg_buffercache"
|
||||
$(MAKE) -C tmp_install/build/contrib/pg_buffercache install
|
||||
+@echo "Compiling pageinspect"
|
||||
$(MAKE) -C tmp_install/build/contrib/pageinspect install
|
||||
|
||||
|
||||
.PHONY: postgres-clean
|
||||
postgres-clean:
|
||||
|
||||
150
README.md
150
README.md
@@ -1,25 +1,30 @@
|
||||
# Zenith
|
||||
# Neon
|
||||
|
||||
Zenith is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes PostgreSQL storage layer by redistributing data across a cluster of nodes.
|
||||
Neon is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes PostgreSQL storage layer by redistributing data across a cluster of nodes.
|
||||
|
||||
The project used to be called "Zenith". Many of the commands and code comments
|
||||
still refer to "zenith", but we are in the process of renaming things.
|
||||
|
||||
## Architecture overview
|
||||
|
||||
A Zenith installation consists of compute nodes and Zenith storage engine.
|
||||
A Neon installation consists of compute nodes and Neon storage engine.
|
||||
|
||||
Compute nodes are stateless PostgreSQL nodes, backed by Zenith storage engine.
|
||||
Compute nodes are stateless PostgreSQL nodes, backed by Neon storage engine.
|
||||
|
||||
Zenith storage engine consists of two major components:
|
||||
Neon storage engine consists of two major components:
|
||||
- Pageserver. Scalable storage backend for compute nodes.
|
||||
- WAL service. The service that receives WAL from compute node and ensures that it is stored durably.
|
||||
|
||||
Pageserver consists of:
|
||||
- Repository - Zenith storage implementation.
|
||||
- Repository - Neon storage implementation.
|
||||
- WAL receiver - service that receives WAL from WAL service and stores it in the repository.
|
||||
- Page service - service that communicates with compute nodes and responds with pages from the repository.
|
||||
- WAL redo - service that builds pages from base images and WAL records on Page service request.
|
||||
|
||||
## Running local installation
|
||||
|
||||
|
||||
#### building on Ubuntu/ Debian (Linux)
|
||||
1. Install build dependencies and other useful packages
|
||||
|
||||
On Ubuntu or Debian this set of packages should be sufficient to build the code:
|
||||
@@ -28,53 +33,89 @@ apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libsec
|
||||
libssl-dev clang pkg-config libpq-dev
|
||||
```
|
||||
|
||||
[Rust] 1.56.1 or later is also required.
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
```
|
||||
# recommended approach from https://www.rust-lang.org/tools/install
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
```
|
||||
|
||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
|
||||
3. Install PostgreSQL Client
|
||||
```
|
||||
apt install postgresql-client
|
||||
```
|
||||
|
||||
To run the integration tests or Python scripts (not required to use the code), install
|
||||
Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.
|
||||
|
||||
2. Build zenith and patched postgres
|
||||
4. Build neon and patched postgres
|
||||
```sh
|
||||
git clone --recursive https://github.com/zenithdb/zenith.git
|
||||
cd zenith
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
cd neon
|
||||
make -j5
|
||||
```
|
||||
|
||||
3. Start pageserver and postgres on top of it (should be called from repo root):
|
||||
#### building on OSX (12.3.1)
|
||||
1. Install XCode
|
||||
```
|
||||
xcode-select --install
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
```
|
||||
# recommended approach from https://www.rust-lang.org/tools/install
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
```
|
||||
|
||||
3. Install PostgreSQL Client
|
||||
```
|
||||
# from https://stackoverflow.com/questions/44654216/correct-way-to-install-psql-without-full-postgres-on-macos
|
||||
brew install libpq
|
||||
brew link --force libpq
|
||||
```
|
||||
|
||||
4. Build neon and patched postgres
|
||||
```sh
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
cd neon
|
||||
make -j5
|
||||
```
|
||||
|
||||
#### dependency installation notes
|
||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
|
||||
|
||||
To run the integration tests or Python scripts (not required to use the code), install
|
||||
Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.
|
||||
|
||||
|
||||
#### running neon database
|
||||
1. Start pageserver and postgres on top of it (should be called from repo root):
|
||||
```sh
|
||||
# Create repository in .zenith with proper paths to binaries and data
|
||||
# Later that would be responsibility of a package install script
|
||||
> ./target/debug/zenith init
|
||||
initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229
|
||||
created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8
|
||||
created main branch
|
||||
> ./target/debug/neon_local init
|
||||
initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c
|
||||
created initial timeline de200bd42b49cc1814412c7e592dd6e9 timeline.lsn 0/16B5A50
|
||||
initial timeline de200bd42b49cc1814412c7e592dd6e9 created
|
||||
pageserver init succeeded
|
||||
|
||||
# start pageserver and safekeeper
|
||||
> ./target/debug/zenith start
|
||||
Starting pageserver at 'localhost:64000' in '.zenith'
|
||||
> ./target/debug/neon_local start
|
||||
Starting pageserver at '127.0.0.1:64000' in '.zenith'
|
||||
Pageserver started
|
||||
initializing for single for 7676
|
||||
Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single'
|
||||
initializing for sk 1 for 7676
|
||||
Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/sk1'
|
||||
Safekeeper started
|
||||
|
||||
# start postgres compute node
|
||||
> ./target/debug/zenith pg start main
|
||||
Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ...
|
||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
|
||||
> ./target/debug/neon_local pg start main
|
||||
Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
|
||||
Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
|
||||
waiting for server to start.... done
|
||||
server started
|
||||
|
||||
# check list of running postgres instances
|
||||
> ./target/debug/zenith pg list
|
||||
NODE ADDRESS TIMELINES BRANCH NAME LSN STATUS
|
||||
main 127.0.0.1:55432 5b014a9e41b4b63ce1a1febc04503636 main 0/1609610 running
|
||||
> ./target/debug/neon_local pg list
|
||||
NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS
|
||||
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running
|
||||
```
|
||||
|
||||
4. Now it is possible to connect to postgres and run some queries:
|
||||
2. Now it is possible to connect to postgres and run some queries:
|
||||
```text
|
||||
> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres
|
||||
postgres=# CREATE TABLE t(key int primary key, value text);
|
||||
@@ -88,21 +129,28 @@ postgres=# select * from t;
|
||||
(1 row)
|
||||
```
|
||||
|
||||
5. And create branches and run postgres on them:
|
||||
3. And create branches and run postgres on them:
|
||||
```sh
|
||||
# create branch named migration_check
|
||||
> ./target/debug/zenith timeline branch --branch-name migration_check
|
||||
Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main'
|
||||
> ./target/debug/neon_local timeline branch --branch-name migration_check
|
||||
Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main'
|
||||
|
||||
# check branches tree
|
||||
> ./target/debug/zenith timeline list
|
||||
main [5b014a9e41b4b63ce1a1febc04503636]
|
||||
┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9]
|
||||
> ./target/debug/neon_local timeline list
|
||||
(L) main [de200bd42b49cc1814412c7e592dd6e9]
|
||||
(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
|
||||
|
||||
# start postgres on that branch
|
||||
> ./target/debug/zenith pg start migration_check
|
||||
Starting postgres node at 'host=127.0.0.1 port=55433 user=stas'
|
||||
waiting for server to start.... done
|
||||
> ./target/debug/neon_local pg start migration_check --branch-name migration_check
|
||||
Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
|
||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
|
||||
Starting postgres node at 'host=127.0.0.1 port=55433 user=zenith_admin dbname=postgres'
|
||||
|
||||
# check the new list of running postgres instances
|
||||
> ./target/debug/neon_local pg list
|
||||
NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS
|
||||
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running
|
||||
migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running
|
||||
|
||||
# this new postgres instance will have all the data from 'main' postgres,
|
||||
# but all modifications would not affect data in original postgres
|
||||
@@ -115,18 +163,26 @@ postgres=# select * from t;
|
||||
|
||||
postgres=# insert into t values(2,2);
|
||||
INSERT 0 1
|
||||
|
||||
# check that the new change doesn't affect the 'main' postgres
|
||||
> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres
|
||||
postgres=# select * from t;
|
||||
key | value
|
||||
-----+-------
|
||||
1 | 1
|
||||
(1 row)
|
||||
```
|
||||
|
||||
6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
|
||||
4. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
|
||||
you have just started. You can stop them all with one command:
|
||||
```sh
|
||||
> ./target/debug/zenith stop
|
||||
> ./target/debug/neon_local stop
|
||||
```
|
||||
|
||||
## Running tests
|
||||
|
||||
```sh
|
||||
git clone --recursive https://github.com/zenithdb/zenith.git
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
make # builds also postgres and installs it to ./tmp_install
|
||||
./scripts/pytest
|
||||
```
|
||||
@@ -141,14 +197,14 @@ To view your `rustdoc` documentation in a browser, try running `cargo doc --no-d
|
||||
|
||||
### Postgres-specific terms
|
||||
|
||||
Due to Zenith's very close relation with PostgreSQL internals, there are numerous specific terms used.
|
||||
Due to Neon's very close relation with PostgreSQL internals, there are numerous specific terms used.
|
||||
Same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use.
|
||||
|
||||
To get more familiar with this aspect, refer to:
|
||||
|
||||
- [Zenith glossary](/docs/glossary.md)
|
||||
- [Neon glossary](/docs/glossary.md)
|
||||
- [PostgreSQL glossary](https://www.postgresql.org/docs/13/glossary.html)
|
||||
- Other PostgreSQL documentation and sources (Zenith fork sources can be found [here](https://github.com/zenithdb/postgres))
|
||||
- Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres))
|
||||
|
||||
## Join the development
|
||||
|
||||
|
||||
@@ -11,9 +11,11 @@ clap = "3.0"
|
||||
env_logger = "0.9"
|
||||
hyper = { version = "0.14", features = ["full"] }
|
||||
log = { version = "0.4", features = ["std", "serde"] }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
regex = "1"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
tar = "0.4"
|
||||
tokio = { version = "1", features = ["macros", "rt", "rt-multi-thread"] }
|
||||
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
# Compute node tools
|
||||
|
||||
Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
|
||||
`ExecStart` option. It will handle all the `zenith` specifics during compute node
|
||||
Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
|
||||
`ExecStart` option. It will handle all the `Neon` specifics during compute node
|
||||
initialization:
|
||||
- `zenith_ctl` accepts cluster (compute node) specification as a JSON file.
|
||||
- `compute_ctl` accepts cluster (compute node) specification as a JSON file.
|
||||
- Every start is a fresh start, so the data directory is removed and
|
||||
initialized again on each run.
|
||||
- Next it will put configuration files into the `PGDATA` directory.
|
||||
@@ -13,18 +13,18 @@ initialization:
|
||||
- Check and alter/drop/create roles and databases.
|
||||
- Hang waiting on the `postmaster` process to exit.
|
||||
|
||||
Also `zenith_ctl` spawns two separate service threads:
|
||||
Also `compute_ctl` spawns two separate service threads:
|
||||
- `compute-monitor` checks the last Postgres activity timestamp and saves it
|
||||
into the shared `ComputeState`;
|
||||
into the shared `ComputeNode`;
|
||||
- `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
||||
last activity requests.
|
||||
|
||||
Usage example:
|
||||
```sh
|
||||
zenith_ctl -D /var/db/postgres/compute \
|
||||
-C 'postgresql://zenith_admin@localhost/postgres' \
|
||||
-S /var/db/postgres/specs/current.json \
|
||||
-b /usr/local/bin/postgres
|
||||
compute_ctl -D /var/db/postgres/compute \
|
||||
-C 'postgresql://zenith_admin@localhost/postgres' \
|
||||
-S /var/db/postgres/specs/current.json \
|
||||
-b /usr/local/bin/postgres
|
||||
```
|
||||
|
||||
## Tests
|
||||
|
||||
174
compute_tools/src/bin/compute_ctl.rs
Normal file
174
compute_tools/src/bin/compute_ctl.rs
Normal file
@@ -0,0 +1,174 @@
|
||||
//!
|
||||
//! Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
|
||||
//! `ExecStart` option. It will handle all the `Neon` specifics during compute node
|
||||
//! initialization:
|
||||
//! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
|
||||
//! - Every start is a fresh start, so the data directory is removed and
|
||||
//! initialized again on each run.
|
||||
//! - Next it will put configuration files into the `PGDATA` directory.
|
||||
//! - Sync safekeepers and get commit LSN.
|
||||
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
|
||||
//! - Try to start `postgres` and wait until it is ready to accept connections.
|
||||
//! - Check and alter/drop/create roles and databases.
|
||||
//! - Hang waiting on the `postmaster` process to exit.
|
||||
//!
|
||||
//! Also `compute_ctl` spawns two separate service threads:
|
||||
//! - `compute-monitor` checks the last Postgres activity timestamp and saves it
|
||||
//! into the shared `ComputeNode`;
|
||||
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
||||
//! last activity requests.
|
||||
//!
|
||||
//! Usage example:
|
||||
//! ```sh
|
||||
//! compute_ctl -D /var/db/postgres/compute \
|
||||
//! -C 'postgresql://zenith_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres
|
||||
//! ```
|
||||
//!
|
||||
use std::fs::File;
|
||||
use std::panic;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::Result;
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use log::{error, info};
|
||||
|
||||
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::pg_helpers::*;
|
||||
use compute_tools::spec::*;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// TODO: re-use `utils::logging` later
|
||||
init_logger(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
// Env variable is set by `cargo`
|
||||
let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
|
||||
let matches = clap::App::new("compute_ctl")
|
||||
.version(version.unwrap_or("unknown"))
|
||||
.arg(
|
||||
Arg::new("connstr")
|
||||
.short('C')
|
||||
.long("connstr")
|
||||
.value_name("DATABASE_URL")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgdata")
|
||||
.short('D')
|
||||
.long("pgdata")
|
||||
.value_name("DATADIR")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbin")
|
||||
.short('b')
|
||||
.long("pgbin")
|
||||
.value_name("POSTGRES_PATH"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec")
|
||||
.short('s')
|
||||
.long("spec")
|
||||
.value_name("SPEC_JSON"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec-path")
|
||||
.short('S')
|
||||
.long("spec-path")
|
||||
.value_name("SPEC_PATH"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
|
||||
let connstr = matches
|
||||
.value_of("connstr")
|
||||
.expect("Postgres connection string is required");
|
||||
let spec = matches.value_of("spec");
|
||||
let spec_path = matches.value_of("spec-path");
|
||||
|
||||
// Try to use just 'postgres' if no path is provided
|
||||
let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
|
||||
|
||||
let spec: ComputeSpec = match spec {
|
||||
// First, try to get cluster spec from the cli argument
|
||||
Some(json) => serde_json::from_str(json)?,
|
||||
None => {
|
||||
// Second, try to read it from the file if path is provided
|
||||
if let Some(sp) = spec_path {
|
||||
let path = Path::new(sp);
|
||||
let file = File::open(path)?;
|
||||
serde_json::from_reader(file)?
|
||||
} else {
|
||||
panic!("cluster spec should be provided via --spec or --spec-path argument");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let pageserver_connstr = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.page_server_connstring")
|
||||
.expect("pageserver connstr should be provided");
|
||||
let tenant = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.zenith_tenant")
|
||||
.expect("tenant id should be provided");
|
||||
let timeline = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.zenith_timeline")
|
||||
.expect("tenant id should be provided");
|
||||
|
||||
let compute_state = ComputeNode {
|
||||
start_time: Utc::now(),
|
||||
connstr: connstr.to_string(),
|
||||
pgdata: pgdata.to_string(),
|
||||
pgbin: pgbin.to_string(),
|
||||
spec,
|
||||
tenant,
|
||||
timeline,
|
||||
pageserver_connstr,
|
||||
metrics: ComputeMetrics::new(),
|
||||
state: RwLock::new(ComputeState::new()),
|
||||
};
|
||||
let compute = Arc::new(compute_state);
|
||||
|
||||
// Launch service threads first, so we were able to serve availability
|
||||
// requests, while configuration is still in progress.
|
||||
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
|
||||
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
|
||||
|
||||
// Run compute (Postgres) and hang waiting on it.
|
||||
match compute.prepare_and_run() {
|
||||
Ok(ec) => {
|
||||
let code = ec.code().unwrap_or(1);
|
||||
info!("Postgres exited with code {}, shutting down", code);
|
||||
exit(code)
|
||||
}
|
||||
Err(error) => {
|
||||
error!("could not start the compute node: {}", error);
|
||||
|
||||
let mut state = compute.state.write().unwrap();
|
||||
state.error = Some(format!("{:?}", error));
|
||||
state.status = ComputeStatus::Failed;
|
||||
drop(state);
|
||||
|
||||
// Keep serving HTTP requests, so the cloud control plane was able to
|
||||
// get the actual error.
|
||||
info!("giving control plane 30s to collect the error before shutdown");
|
||||
thread::sleep(Duration::from_secs(30));
|
||||
info!("shutting down");
|
||||
Err(error)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,249 +0,0 @@
|
||||
//!
|
||||
//! Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
|
||||
//! `ExecStart` option. It will handle all the `zenith` specifics during compute node
|
||||
//! initialization:
|
||||
//! - `zenith_ctl` accepts cluster (compute node) specification as a JSON file.
|
||||
//! - Every start is a fresh start, so the data directory is removed and
|
||||
//! initialized again on each run.
|
||||
//! - Next it will put configuration files into the `PGDATA` directory.
|
||||
//! - Sync safekeepers and get commit LSN.
|
||||
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
|
||||
//! - Try to start `postgres` and wait until it is ready to accept connections.
|
||||
//! - Check and alter/drop/create roles and databases.
|
||||
//! - Hang waiting on the `postmaster` process to exit.
|
||||
//!
|
||||
//! Also `zenith_ctl` spawns two separate service threads:
|
||||
//! - `compute-monitor` checks the last Postgres activity timestamp and saves it
|
||||
//! into the shared `ComputeState`;
|
||||
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
||||
//! last activity requests.
|
||||
//!
|
||||
//! Usage example:
|
||||
//! ```sh
|
||||
//! zenith_ctl -D /var/db/postgres/compute \
|
||||
//! -C 'postgresql://zenith_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres
|
||||
//! ```
|
||||
//!
|
||||
use std::fs::File;
|
||||
use std::panic;
|
||||
use std::path::Path;
|
||||
use std::process::{exit, Command, ExitStatus};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use log::info;
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use compute_tools::config;
|
||||
use compute_tools::http_api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::pg_helpers::*;
|
||||
use compute_tools::spec::*;
|
||||
use compute_tools::zenith::*;
|
||||
|
||||
/// Do all the preparations like PGDATA directory creation, configuration,
|
||||
/// safekeepers sync, basebackup, etc.
|
||||
fn prepare_pgdata(state: &Arc<RwLock<ComputeState>>) -> Result<()> {
|
||||
let state = state.read().unwrap();
|
||||
let spec = &state.spec;
|
||||
let pgdata_path = Path::new(&state.pgdata);
|
||||
let pageserver_connstr = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.page_server_connstring")
|
||||
.expect("pageserver connstr should be provided");
|
||||
let tenant = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.zenith_tenant")
|
||||
.expect("tenant id should be provided");
|
||||
let timeline = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.zenith_timeline")
|
||||
.expect("tenant id should be provided");
|
||||
|
||||
info!(
|
||||
"starting cluster #{}, operation #{}",
|
||||
spec.cluster.cluster_id,
|
||||
spec.operation_uuid.as_ref().unwrap()
|
||||
);
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
create_pgdata(&state.pgdata)?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
|
||||
|
||||
info!("starting safekeepers syncing");
|
||||
let lsn = sync_safekeepers(&state.pgdata, &state.pgbin)
|
||||
.with_context(|| "failed to sync safekeepers")?;
|
||||
info!("safekeepers synced at LSN {}", lsn);
|
||||
|
||||
info!(
|
||||
"getting basebackup@{} from pageserver {}",
|
||||
lsn, pageserver_connstr
|
||||
);
|
||||
get_basebackup(&state.pgdata, &pageserver_connstr, &tenant, &timeline, &lsn).with_context(
|
||||
|| {
|
||||
format!(
|
||||
"failed to get basebackup@{} from pageserver {}",
|
||||
lsn, pageserver_connstr
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
// Update pg_hba.conf received with basebackup.
|
||||
update_pg_hba(pgdata_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start Postgres as a child process and manage DBs/roles.
|
||||
/// After that this will hang waiting on the postmaster process to exit.
|
||||
fn run_compute(state: &Arc<RwLock<ComputeState>>) -> Result<ExitStatus> {
|
||||
let read_state = state.read().unwrap();
|
||||
let pgdata_path = Path::new(&read_state.pgdata);
|
||||
|
||||
// Run postgres as a child process.
|
||||
let mut pg = Command::new(&read_state.pgbin)
|
||||
.args(&["-D", &read_state.pgdata])
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
|
||||
// Try default Postgres port if it is not provided
|
||||
let port = read_state
|
||||
.spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("port")
|
||||
.unwrap_or_else(|| "5432".to_string());
|
||||
wait_for_postgres(&port, pgdata_path)?;
|
||||
|
||||
let mut client = Client::connect(&read_state.connstr, NoTls)?;
|
||||
|
||||
handle_roles(&read_state.spec, &mut client)?;
|
||||
handle_databases(&read_state.spec, &mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
|
||||
info!(
|
||||
"finished configuration of cluster #{}",
|
||||
read_state.spec.cluster.cluster_id
|
||||
);
|
||||
|
||||
// Release the read lock.
|
||||
drop(read_state);
|
||||
|
||||
// Get the write lock, update state and release the lock, so HTTP API
|
||||
// was able to serve requests, while we are blocked waiting on
|
||||
// Postgres.
|
||||
let mut state = state.write().unwrap();
|
||||
state.ready = true;
|
||||
drop(state);
|
||||
|
||||
// Wait for child postgres process basically forever. In this state Ctrl+C
|
||||
// will be propagated to postgres and it will be shut down as well.
|
||||
let ecode = pg.wait().expect("failed to wait on postgres");
|
||||
|
||||
Ok(ecode)
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// TODO: re-use `zenith_utils::logging` later
|
||||
init_logger(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
// Env variable is set by `cargo`
|
||||
let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
|
||||
let matches = clap::App::new("zenith_ctl")
|
||||
.version(version.unwrap_or("unknown"))
|
||||
.arg(
|
||||
Arg::new("connstr")
|
||||
.short('C')
|
||||
.long("connstr")
|
||||
.value_name("DATABASE_URL")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgdata")
|
||||
.short('D')
|
||||
.long("pgdata")
|
||||
.value_name("DATADIR")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbin")
|
||||
.short('b')
|
||||
.long("pgbin")
|
||||
.value_name("POSTGRES_PATH"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec")
|
||||
.short('s')
|
||||
.long("spec")
|
||||
.value_name("SPEC_JSON"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec-path")
|
||||
.short('S')
|
||||
.long("spec-path")
|
||||
.value_name("SPEC_PATH"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
|
||||
let connstr = matches
|
||||
.value_of("connstr")
|
||||
.expect("Postgres connection string is required");
|
||||
let spec = matches.value_of("spec");
|
||||
let spec_path = matches.value_of("spec-path");
|
||||
|
||||
// Try to use just 'postgres' if no path is provided
|
||||
let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
|
||||
|
||||
let spec: ClusterSpec = match spec {
|
||||
// First, try to get cluster spec from the cli argument
|
||||
Some(json) => serde_json::from_str(json)?,
|
||||
None => {
|
||||
// Second, try to read it from the file if path is provided
|
||||
if let Some(sp) = spec_path {
|
||||
let path = Path::new(sp);
|
||||
let file = File::open(path)?;
|
||||
serde_json::from_reader(file)?
|
||||
} else {
|
||||
panic!("cluster spec should be provided via --spec or --spec-path argument");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let compute_state = ComputeState {
|
||||
connstr: connstr.to_string(),
|
||||
pgdata: pgdata.to_string(),
|
||||
pgbin: pgbin.to_string(),
|
||||
spec,
|
||||
ready: false,
|
||||
last_active: Utc::now(),
|
||||
};
|
||||
let compute_state = Arc::new(RwLock::new(compute_state));
|
||||
|
||||
// Launch service threads first, so we were able to serve availability
|
||||
// requests, while configuration is still in progress.
|
||||
let mut _threads = vec![
|
||||
launch_http_server(&compute_state).expect("cannot launch compute monitor thread"),
|
||||
launch_monitor(&compute_state).expect("cannot launch http endpoint thread"),
|
||||
];
|
||||
|
||||
prepare_pgdata(&compute_state)?;
|
||||
|
||||
// Run compute (Postgres) and hang waiting on it. Panic if any error happens,
|
||||
// it will help us to trigger unwind and kill postmaster as well.
|
||||
match run_compute(&compute_state) {
|
||||
Ok(ec) => exit(ec.success() as i32),
|
||||
Err(error) => panic!("cannot start compute node, error: {}", error),
|
||||
}
|
||||
}
|
||||
46
compute_tools/src/checker.rs
Normal file
46
compute_tools/src/checker.rs
Normal file
@@ -0,0 +1,46 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use log::error;
|
||||
use postgres::Client;
|
||||
use tokio_postgres::NoTls;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
|
||||
let query = "
|
||||
CREATE TABLE IF NOT EXISTS health_check (
|
||||
id serial primary key,
|
||||
updated_at timestamptz default now()
|
||||
);
|
||||
INSERT INTO health_check VALUES (1, now())
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET updated_at = now();";
|
||||
let result = client.simple_query(query)?;
|
||||
if result.len() < 2 {
|
||||
return Err(anyhow::format_err!("executed {} queries", result.len()));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn check_writability(compute: &Arc<ComputeNode>) -> Result<()> {
|
||||
let connstr = &compute.connstr;
|
||||
let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?;
|
||||
if client.is_closed() {
|
||||
return Err(anyhow!("connection to postgres closed"));
|
||||
}
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
error!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
let result = client
|
||||
.simple_query("UPDATE health_check SET updated_at = now() WHERE id = 1;")
|
||||
.await?;
|
||||
|
||||
if result.len() != 1 {
|
||||
return Err(anyhow!("statement can't be executed"));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
315
compute_tools/src/compute.rs
Normal file
315
compute_tools/src/compute.rs
Normal file
@@ -0,0 +1,315 @@
|
||||
//
|
||||
// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`,
|
||||
// but there are several things that makes `PostgresNode` usage inconvenient in the
|
||||
// cloud:
|
||||
// - it inherits from `LocalEnv`, which contains **all-all** the information about
|
||||
// a complete service running
|
||||
// - it uses `PageServerNode` with information about http endpoint, which we do not
|
||||
// need in the cloud again
|
||||
// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud
|
||||
//
|
||||
// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required
|
||||
// attributes (not required for the cloud). Yet, it is still tempting to unify these
|
||||
// `PostgresNode` and `ComputeNode` and use one in both places.
|
||||
//
|
||||
// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`.
|
||||
//
|
||||
use std::fs;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, ExitStatus, Stdio};
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::RwLock;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use log::info;
|
||||
use postgres::{Client, NoTls};
|
||||
use serde::{Serialize, Serializer};
|
||||
|
||||
use crate::checker::create_writablity_check_data;
|
||||
use crate::config;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
pub start_time: DateTime<Utc>,
|
||||
pub connstr: String,
|
||||
pub pgdata: String,
|
||||
pub pgbin: String,
|
||||
pub spec: ComputeSpec,
|
||||
pub tenant: String,
|
||||
pub timeline: String,
|
||||
pub pageserver_connstr: String,
|
||||
pub metrics: ComputeMetrics,
|
||||
/// Volatile part of the `ComputeNode` so should be used under `RwLock`
|
||||
/// to allow HTTP API server to serve status requests, while configuration
|
||||
/// is in progress.
|
||||
pub state: RwLock<ComputeState>,
|
||||
}
|
||||
|
||||
fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
x.to_rfc3339().serialize(s)
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub struct ComputeState {
|
||||
pub status: ComputeStatus,
|
||||
/// Timestamp of the last Postgres activity
|
||||
#[serde(serialize_with = "rfc3339_serialize")]
|
||||
pub last_active: DateTime<Utc>,
|
||||
pub error: Option<String>,
|
||||
}
|
||||
|
||||
impl ComputeState {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
status: ComputeStatus::Init,
|
||||
last_active: Utc::now(),
|
||||
error: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ComputeState {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy, PartialEq, Eq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ComputeStatus {
|
||||
Init,
|
||||
Running,
|
||||
Failed,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct ComputeMetrics {
|
||||
pub sync_safekeepers_ms: AtomicU64,
|
||||
pub basebackup_ms: AtomicU64,
|
||||
pub config_ms: AtomicU64,
|
||||
pub total_startup_ms: AtomicU64,
|
||||
}
|
||||
|
||||
impl ComputeMetrics {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
sync_safekeepers_ms: AtomicU64::new(0),
|
||||
basebackup_ms: AtomicU64::new(0),
|
||||
config_ms: AtomicU64::new(0),
|
||||
total_startup_ms: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ComputeMetrics {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl ComputeNode {
|
||||
pub fn set_status(&self, status: ComputeStatus) {
|
||||
self.state.write().unwrap().status = status;
|
||||
}
|
||||
|
||||
pub fn get_status(&self) -> ComputeStatus {
|
||||
self.state.read().unwrap().status
|
||||
}
|
||||
|
||||
// Remove `pgdata` directory and create it again with right permissions.
|
||||
fn create_pgdata(&self) -> Result<()> {
|
||||
// Ignore removal error, likely it is a 'No such file or directory (os error 2)'.
|
||||
// If it is something different then create_dir() will error out anyway.
|
||||
let _ok = fs::remove_dir_all(&self.pgdata);
|
||||
fs::create_dir(&self.pgdata)?;
|
||||
fs::set_permissions(&self.pgdata, fs::Permissions::from_mode(0o700))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Get basebackup from the libpq connection to pageserver using `connstr` and
|
||||
// unarchive it to `pgdata` directory overriding all its previous content.
|
||||
fn get_basebackup(&self, lsn: &str) -> Result<()> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let mut client = Client::connect(&self.pageserver_connstr, NoTls)?;
|
||||
let basebackup_cmd = match lsn {
|
||||
"0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute
|
||||
_ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
|
||||
};
|
||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||
let mut ar = tar::Archive::new(copyreader);
|
||||
|
||||
ar.unpack(&self.pgdata)?;
|
||||
|
||||
self.metrics.basebackup_ms.store(
|
||||
Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Run `postgres` in a special mode with `--sync-safekeepers` argument
|
||||
// and return the reported LSN back to the caller.
|
||||
fn sync_safekeepers(&self) -> Result<String> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let sync_handle = Command::new(&self.pgbin)
|
||||
.args(&["--sync-safekeepers"])
|
||||
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
|
||||
.stdout(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
|
||||
// `postgres --sync-safekeepers` will print all log output to stderr and
|
||||
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
|
||||
// redirected to the caller output.
|
||||
let sync_output = sync_handle
|
||||
.wait_with_output()
|
||||
.expect("postgres --sync-safekeepers failed");
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"postgres --sync-safekeepers exited with non-zero status: {}",
|
||||
sync_output.status,
|
||||
);
|
||||
}
|
||||
|
||||
self.metrics.sync_safekeepers_ms.store(
|
||||
Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
|
||||
let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
|
||||
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
/// Do all the preparations like PGDATA directory creation, configuration,
|
||||
/// safekeepers sync, basebackup, etc.
|
||||
pub fn prepare_pgdata(&self) -> Result<()> {
|
||||
let spec = &self.spec;
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
self.create_pgdata()?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
|
||||
|
||||
info!("starting safekeepers syncing");
|
||||
let lsn = self
|
||||
.sync_safekeepers()
|
||||
.with_context(|| "failed to sync safekeepers")?;
|
||||
info!("safekeepers synced at LSN {}", lsn);
|
||||
|
||||
info!(
|
||||
"getting basebackup@{} from pageserver {}",
|
||||
lsn, &self.pageserver_connstr
|
||||
);
|
||||
self.get_basebackup(&lsn).with_context(|| {
|
||||
format!(
|
||||
"failed to get basebackup@{} from pageserver {}",
|
||||
lsn, &self.pageserver_connstr
|
||||
)
|
||||
})?;
|
||||
|
||||
// Update pg_hba.conf received with basebackup.
|
||||
update_pg_hba(pgdata_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start Postgres as a child process and manage DBs/roles.
|
||||
/// After that this will hang waiting on the postmaster process to exit.
|
||||
pub fn run(&self) -> Result<ExitStatus> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Run postgres as a child process.
|
||||
let mut pg = Command::new(&self.pgbin)
|
||||
.args(&["-D", &self.pgdata])
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
|
||||
// Try default Postgres port if it is not provided
|
||||
let port = self
|
||||
.spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("port")
|
||||
.unwrap_or_else(|| "5432".to_string());
|
||||
wait_for_postgres(&mut pg, &port, pgdata_path)?;
|
||||
|
||||
let mut client = Client::connect(&self.connstr, NoTls)?;
|
||||
|
||||
handle_roles(&self.spec, &mut client)?;
|
||||
handle_databases(&self.spec, &mut client)?;
|
||||
handle_grants(&self.spec, &mut client)?;
|
||||
create_writablity_check_data(&mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
let startup_end_time = Utc::now();
|
||||
|
||||
self.metrics.config_ms.store(
|
||||
startup_end_time
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
self.metrics.total_startup_ms.store(
|
||||
startup_end_time
|
||||
.signed_duration_since(self.start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
|
||||
self.set_status(ComputeStatus::Running);
|
||||
|
||||
info!(
|
||||
"finished configuration of compute for project {}",
|
||||
self.spec.cluster.cluster_id
|
||||
);
|
||||
|
||||
// Wait for child Postgres process basically forever. In this state Ctrl+C
|
||||
// will propagate to Postgres and it will be shut down as well.
|
||||
let ecode = pg
|
||||
.wait()
|
||||
.expect("failed to start waiting on Postgres process");
|
||||
|
||||
Ok(ecode)
|
||||
}
|
||||
|
||||
pub fn prepare_and_run(&self) -> Result<ExitStatus> {
|
||||
info!(
|
||||
"starting compute for project {}, operation {}, tenant {}, timeline {}",
|
||||
self.spec.cluster.cluster_id,
|
||||
self.spec.operation_uuid.as_ref().unwrap(),
|
||||
self.tenant,
|
||||
self.timeline,
|
||||
);
|
||||
|
||||
self.prepare_pgdata()?;
|
||||
self.run()
|
||||
}
|
||||
}
|
||||
@@ -6,7 +6,7 @@ use std::path::Path;
|
||||
use anyhow::Result;
|
||||
|
||||
use crate::pg_helpers::PgOptionsSerialize;
|
||||
use crate::zenith::ClusterSpec;
|
||||
use crate::spec::ComputeSpec;
|
||||
|
||||
/// Check that `line` is inside a text file and put it there if it is not.
|
||||
/// Create file if it doesn't exist.
|
||||
@@ -32,20 +32,20 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
|
||||
}
|
||||
|
||||
/// Create or completely rewrite configuration file specified by `path`
|
||||
pub fn write_postgres_conf(path: &Path, spec: &ClusterSpec) -> Result<()> {
|
||||
pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
|
||||
// File::create() destroys the file content if it exists.
|
||||
let mut postgres_conf = File::create(path)?;
|
||||
|
||||
write_zenith_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?;
|
||||
write_auto_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Write Postgres config block wrapped with generated comment section
|
||||
fn write_zenith_managed_block(file: &mut File, buf: &str) -> Result<()> {
|
||||
writeln!(file, "# Managed by Zenith: begin")?;
|
||||
fn write_auto_managed_block(file: &mut File, buf: &str) -> Result<()> {
|
||||
writeln!(file, "# Managed by compute_ctl: begin")?;
|
||||
writeln!(file, "{}", buf)?;
|
||||
writeln!(file, "# Managed by Zenith: end")?;
|
||||
writeln!(file, "# Managed by compute_ctl: end")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
109
compute_tools/src/http/api.rs
Normal file
109
compute_tools/src/http/api.rs
Normal file
@@ -0,0 +1,109 @@
|
||||
use std::convert::Infallible;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use anyhow::Result;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use log::{error, info};
|
||||
use serde_json;
|
||||
|
||||
use crate::compute::{ComputeNode, ComputeStatus};
|
||||
|
||||
// Service function to handle all available routes.
|
||||
async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body> {
|
||||
match (req.method(), req.uri().path()) {
|
||||
// Timestamp of the last Postgres activity in the plain text.
|
||||
// DEPRECATED in favour of /status
|
||||
(&Method::GET, "/last_activity") => {
|
||||
info!("serving /last_active GET request");
|
||||
let state = compute.state.read().unwrap();
|
||||
|
||||
// Use RFC3339 format for consistency.
|
||||
Response::new(Body::from(state.last_active.to_rfc3339()))
|
||||
}
|
||||
|
||||
// Has compute setup process finished? -> true/false.
|
||||
// DEPRECATED in favour of /status
|
||||
(&Method::GET, "/ready") => {
|
||||
info!("serving /ready GET request");
|
||||
let status = compute.get_status();
|
||||
Response::new(Body::from(format!("{}", status == ComputeStatus::Running)))
|
||||
}
|
||||
|
||||
// Serialized compute state.
|
||||
(&Method::GET, "/status") => {
|
||||
info!("serving /status GET request");
|
||||
let state = compute.state.read().unwrap();
|
||||
Response::new(Body::from(serde_json::to_string(&*state).unwrap()))
|
||||
}
|
||||
|
||||
// Startup metrics in JSON format. Keep /metrics reserved for a possible
|
||||
// future use for Prometheus metrics format.
|
||||
(&Method::GET, "/metrics.json") => {
|
||||
info!("serving /metrics.json GET request");
|
||||
Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
|
||||
}
|
||||
|
||||
// DEPRECATED, use POST instead
|
||||
(&Method::GET, "/check_writability") => {
|
||||
info!("serving /check_writability GET request");
|
||||
let res = crate::checker::check_writability(&compute).await;
|
||||
match res {
|
||||
Ok(_) => Response::new(Body::from("true")),
|
||||
Err(e) => Response::new(Body::from(e.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::POST, "/check_writability") => {
|
||||
info!("serving /check_writability POST request");
|
||||
let res = crate::checker::check_writability(&compute).await;
|
||||
match res {
|
||||
Ok(_) => Response::new(Body::from("true")),
|
||||
Err(e) => Response::new(Body::from(e.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
// Return the `404 Not Found` for any other routes.
|
||||
_ => {
|
||||
let mut not_found = Response::new(Body::from("404 Not Found"));
|
||||
*not_found.status_mut() = StatusCode::NOT_FOUND;
|
||||
not_found
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(state: Arc<ComputeNode>) {
|
||||
let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
|
||||
|
||||
let make_service = make_service_fn(move |_conn| {
|
||||
let state = state.clone();
|
||||
async move {
|
||||
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
|
||||
let state = state.clone();
|
||||
async move { Ok::<_, Infallible>(routes(req, state).await) }
|
||||
}))
|
||||
}
|
||||
});
|
||||
|
||||
info!("starting HTTP server on {}", addr);
|
||||
|
||||
let server = Server::bind(&addr).serve(make_service);
|
||||
|
||||
// Run this server forever
|
||||
if let Err(e) = server.await {
|
||||
error!("server error: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
|
||||
pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
Ok(thread::Builder::new()
|
||||
.name("http-endpoint".into())
|
||||
.spawn(move || serve(state))?)
|
||||
}
|
||||
1
compute_tools/src/http/mod.rs
Normal file
1
compute_tools/src/http/mod.rs
Normal file
@@ -0,0 +1 @@
|
||||
pub mod api;
|
||||
158
compute_tools/src/http/openapi_spec.yaml
Normal file
158
compute_tools/src/http/openapi_spec.yaml
Normal file
@@ -0,0 +1,158 @@
|
||||
openapi: "3.0.2"
|
||||
info:
|
||||
title: Compute node control API
|
||||
version: "1.0"
|
||||
|
||||
servers:
|
||||
- url: "http://localhost:3080"
|
||||
|
||||
paths:
|
||||
/status:
|
||||
get:
|
||||
tags:
|
||||
- "info"
|
||||
summary: Get compute node internal status
|
||||
description: ""
|
||||
operationId: getComputeStatus
|
||||
responses:
|
||||
"200":
|
||||
description: ComputeState
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeState"
|
||||
|
||||
/metrics.json:
|
||||
get:
|
||||
tags:
|
||||
- "info"
|
||||
summary: Get compute node startup metrics in JSON format
|
||||
description: ""
|
||||
operationId: getComputeMetricsJSON
|
||||
responses:
|
||||
"200":
|
||||
description: ComputeMetrics
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeMetrics"
|
||||
|
||||
/ready:
|
||||
get:
|
||||
deprecated: true
|
||||
tags:
|
||||
- "info"
|
||||
summary: Check whether compute startup process finished successfully
|
||||
description: ""
|
||||
operationId: computeIsReady
|
||||
responses:
|
||||
"200":
|
||||
description: Compute is ready ('true') or not ('false')
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
example: "true"
|
||||
|
||||
/last_activity:
|
||||
get:
|
||||
deprecated: true
|
||||
tags:
|
||||
- "info"
|
||||
summary: Get timestamp of the last compute activity
|
||||
description: ""
|
||||
operationId: getLastComputeActivityTS
|
||||
responses:
|
||||
"200":
|
||||
description: Timestamp of the last compute activity
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
example: "2022-10-12T07:20:50.52Z"
|
||||
|
||||
/check_writability:
|
||||
get:
|
||||
deprecated: true
|
||||
tags:
|
||||
- "check"
|
||||
summary: Check that we can write new data on this compute
|
||||
description: ""
|
||||
operationId: checkComputeWritabilityDeprecated
|
||||
responses:
|
||||
"200":
|
||||
description: Check result
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Error text or 'true' if check passed
|
||||
example: "true"
|
||||
|
||||
post:
|
||||
tags:
|
||||
- "check"
|
||||
summary: Check that we can write new data on this compute
|
||||
description: ""
|
||||
operationId: checkComputeWritability
|
||||
responses:
|
||||
"200":
|
||||
description: Check result
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Error text or 'true' if check passed
|
||||
example: "true"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
type: http
|
||||
scheme: bearer
|
||||
bearerFormat: JWT
|
||||
|
||||
schemas:
|
||||
ComputeMetrics:
|
||||
type: object
|
||||
description: Compute startup metrics
|
||||
required:
|
||||
- sync_safekeepers_ms
|
||||
- basebackup_ms
|
||||
- config_ms
|
||||
- total_startup_ms
|
||||
properties:
|
||||
sync_safekeepers_ms:
|
||||
type: integer
|
||||
basebackup_ms:
|
||||
type: integer
|
||||
config_ms:
|
||||
type: integer
|
||||
total_startup_ms:
|
||||
type: integer
|
||||
|
||||
ComputeState:
|
||||
type: object
|
||||
required:
|
||||
- status
|
||||
- last_active
|
||||
properties:
|
||||
status:
|
||||
$ref: '#/components/schemas/ComputeStatus'
|
||||
last_active:
|
||||
type: string
|
||||
description: The last detected compute activity timestamp in UTC and RFC3339 format
|
||||
example: "2022-10-12T07:20:50.52Z"
|
||||
error:
|
||||
type: string
|
||||
description: Text of the error during compute startup, if any
|
||||
|
||||
ComputeStatus:
|
||||
type: string
|
||||
enum:
|
||||
- init
|
||||
- failed
|
||||
- running
|
||||
|
||||
security:
|
||||
- JWT: []
|
||||
@@ -1,73 +0,0 @@
|
||||
use std::convert::Infallible;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::thread;
|
||||
|
||||
use anyhow::Result;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use log::{error, info};
|
||||
|
||||
use crate::zenith::*;
|
||||
|
||||
// Service function to handle all available routes.
|
||||
fn routes(req: Request<Body>, state: Arc<RwLock<ComputeState>>) -> Response<Body> {
|
||||
match (req.method(), req.uri().path()) {
|
||||
// Timestamp of the last Postgres activity in the plain text.
|
||||
(&Method::GET, "/last_activity") => {
|
||||
info!("serving /last_active GET request");
|
||||
let state = state.read().unwrap();
|
||||
|
||||
// Use RFC3339 format for consistency.
|
||||
Response::new(Body::from(state.last_active.to_rfc3339()))
|
||||
}
|
||||
|
||||
// Has compute setup process finished? -> true/false
|
||||
(&Method::GET, "/ready") => {
|
||||
info!("serving /ready GET request");
|
||||
let state = state.read().unwrap();
|
||||
Response::new(Body::from(format!("{}", state.ready)))
|
||||
}
|
||||
|
||||
// Return the `404 Not Found` for any other routes.
|
||||
_ => {
|
||||
let mut not_found = Response::new(Body::from("404 Not Found"));
|
||||
*not_found.status_mut() = StatusCode::NOT_FOUND;
|
||||
not_found
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(state: Arc<RwLock<ComputeState>>) {
|
||||
let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
|
||||
|
||||
let make_service = make_service_fn(move |_conn| {
|
||||
let state = state.clone();
|
||||
async move {
|
||||
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
|
||||
let state = state.clone();
|
||||
async move { Ok::<_, Infallible>(routes(req, state)) }
|
||||
}))
|
||||
}
|
||||
});
|
||||
|
||||
info!("starting HTTP server on {}", addr);
|
||||
|
||||
let server = Server::bind(&addr).serve(make_service);
|
||||
|
||||
// Run this server forever
|
||||
if let Err(e) = server.await {
|
||||
error!("server error: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
|
||||
pub fn launch_http_server(state: &Arc<RwLock<ComputeState>>) -> Result<thread::JoinHandle<()>> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
Ok(thread::Builder::new()
|
||||
.name("http-endpoint".into())
|
||||
.spawn(move || serve(state))?)
|
||||
}
|
||||
@@ -2,12 +2,13 @@
|
||||
//! Various tools and helpers to handle cluster / compute node (Postgres)
|
||||
//! configuration.
|
||||
//!
|
||||
pub mod checker;
|
||||
pub mod config;
|
||||
pub mod http_api;
|
||||
pub mod http;
|
||||
#[macro_use]
|
||||
pub mod logger;
|
||||
pub mod compute;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
pub mod spec;
|
||||
pub mod zenith;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::sync::Arc;
|
||||
use std::{thread, time};
|
||||
|
||||
use anyhow::Result;
|
||||
@@ -6,16 +6,16 @@ use chrono::{DateTime, Utc};
|
||||
use log::{debug, info};
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use crate::zenith::ComputeState;
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
|
||||
|
||||
// Spin in a loop and figure out the last activity time in the Postgres.
|
||||
// Then update it in the shared state. This function never errors out.
|
||||
// XXX: the only expected panic is at `RwLock` unwrap().
|
||||
fn watch_compute_activity(state: &Arc<RwLock<ComputeState>>) {
|
||||
fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
||||
// Suppose that `connstr` doesn't change
|
||||
let connstr = state.read().unwrap().connstr.clone();
|
||||
let connstr = compute.connstr.clone();
|
||||
// Define `client` outside of the loop to reuse existing connection if it's active.
|
||||
let mut client = Client::connect(&connstr, NoTls);
|
||||
let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
|
||||
@@ -46,7 +46,7 @@ fn watch_compute_activity(state: &Arc<RwLock<ComputeState>>) {
|
||||
AND usename != 'zenith_admin';", // XXX: find a better way to filter other monitors?
|
||||
&[],
|
||||
);
|
||||
let mut last_active = state.read().unwrap().last_active;
|
||||
let mut last_active = compute.state.read().unwrap().last_active;
|
||||
|
||||
if let Ok(backs) = backends {
|
||||
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
|
||||
@@ -83,14 +83,14 @@ fn watch_compute_activity(state: &Arc<RwLock<ComputeState>>) {
|
||||
}
|
||||
|
||||
// Update the last activity in the shared state if we got a more recent one.
|
||||
let mut state = state.write().unwrap();
|
||||
let mut state = compute.state.write().unwrap();
|
||||
if last_active > state.last_active {
|
||||
state.last_active = last_active;
|
||||
debug!("set the last compute activity time to: {}", last_active);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
info!("cannot connect to postgres: {}, retrying", e);
|
||||
debug!("cannot connect to postgres: {}, retrying", e);
|
||||
|
||||
// Establish a new connection and try again.
|
||||
client = Client::connect(&connstr, NoTls);
|
||||
@@ -100,7 +100,7 @@ fn watch_compute_activity(state: &Arc<RwLock<ComputeState>>) {
|
||||
}
|
||||
|
||||
/// Launch a separate compute monitor thread and return its `JoinHandle`.
|
||||
pub fn launch_monitor(state: &Arc<RwLock<ComputeState>>) -> Result<thread::JoinHandle<()>> {
|
||||
pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
Ok(thread::Builder::new()
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::net::{SocketAddr, TcpStream};
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
use std::process::Child;
|
||||
use std::str::FromStr;
|
||||
use std::{fs, thread, time};
|
||||
|
||||
@@ -132,7 +134,14 @@ impl Role {
|
||||
let mut params: String = "LOGIN".to_string();
|
||||
|
||||
if let Some(pass) = &self.encrypted_password {
|
||||
params.push_str(&format!(" PASSWORD 'md5{}'", pass));
|
||||
// Some time ago we supported only md5 and treated all encrypted_password as md5.
|
||||
// Now we also support SCRAM-SHA-256 and to preserve compatibility
|
||||
// we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256.
|
||||
if pass.starts_with("SCRAM-SHA-256") {
|
||||
params.push_str(&format!(" PASSWORD '{}'", pass));
|
||||
} else {
|
||||
params.push_str(&format!(" PASSWORD 'md5{}'", pass));
|
||||
}
|
||||
} else {
|
||||
params.push_str(" PASSWORD NULL");
|
||||
}
|
||||
@@ -213,12 +222,12 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
|
||||
/// Wait for Postgres to become ready to accept connections:
|
||||
/// - state should be `ready` in the `pgdata/postmaster.pid`
|
||||
/// - and we should be able to connect to 127.0.0.1:5432
|
||||
pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> {
|
||||
pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> {
|
||||
let pid_path = pgdata.join("postmaster.pid");
|
||||
let mut slept: u64 = 0; // ms
|
||||
let pause = time::Duration::from_millis(100);
|
||||
|
||||
let timeout = time::Duration::from_millis(200);
|
||||
let timeout = time::Duration::from_millis(10);
|
||||
let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap();
|
||||
|
||||
loop {
|
||||
@@ -229,14 +238,19 @@ pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> {
|
||||
bail!("timed out while waiting for Postgres to start");
|
||||
}
|
||||
|
||||
if let Ok(Some(status)) = pg.try_wait() {
|
||||
// Postgres exited, that is not what we expected, bail out earlier.
|
||||
let code = status.code().unwrap_or(-1);
|
||||
bail!("Postgres exited unexpectedly with code {}", code);
|
||||
}
|
||||
|
||||
if pid_path.exists() {
|
||||
// XXX: dumb and the simplest way to get the last line in a text file
|
||||
// TODO: better use `.lines().last()` later
|
||||
let stdout = Command::new("tail")
|
||||
.args(&["-n1", pid_path.to_str().unwrap()])
|
||||
.output()?
|
||||
.stdout;
|
||||
let status = String::from_utf8(stdout)?;
|
||||
let file = BufReader::new(File::open(&pid_path)?);
|
||||
let status = file
|
||||
.lines()
|
||||
.last()
|
||||
.unwrap()
|
||||
.unwrap_or_else(|_| "unknown".to_string());
|
||||
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
|
||||
|
||||
// Now Postgres is ready to accept connections
|
||||
|
||||
@@ -3,16 +3,53 @@ use std::path::Path;
|
||||
use anyhow::Result;
|
||||
use log::{info, log_enabled, warn, Level};
|
||||
use postgres::Client;
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::config;
|
||||
use crate::params::PG_HBA_ALL_MD5;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::zenith::ClusterSpec;
|
||||
|
||||
/// Cluster spec or configuration represented as an optional number of
|
||||
/// delta operations + final cluster state description.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct ComputeSpec {
|
||||
pub format_version: f32,
|
||||
pub timestamp: String,
|
||||
pub operation_uuid: Option<String>,
|
||||
/// Expected cluster state at the end of transition process.
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
}
|
||||
|
||||
/// Cluster state seen from the perspective of the external tools
|
||||
/// like Rails web console.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct Cluster {
|
||||
pub cluster_id: String,
|
||||
pub name: String,
|
||||
pub state: Option<String>,
|
||||
pub roles: Vec<Role>,
|
||||
pub databases: Vec<Database>,
|
||||
pub settings: GenericOptions,
|
||||
}
|
||||
|
||||
/// Single cluster state changing operation that could not be represented as
|
||||
/// a static `Cluster` structure. For example:
|
||||
/// - DROP DATABASE
|
||||
/// - DROP ROLE
|
||||
/// - ALTER ROLE name RENAME TO new_name
|
||||
/// - ALTER DATABASE name RENAME TO new_name
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct DeltaOp {
|
||||
pub action: String,
|
||||
pub name: PgIdent,
|
||||
pub new_name: Option<PgIdent>,
|
||||
}
|
||||
|
||||
/// It takes cluster specification and does the following:
|
||||
/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
|
||||
/// - Update `pg_hba.conf` to allow external connections.
|
||||
pub fn handle_configuration(spec: &ClusterSpec, pgdata_path: &Path) -> Result<()> {
|
||||
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
|
||||
// File `postgresql.conf` is no longer included into `basebackup`, so just
|
||||
// always write all config into it creating new file.
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
|
||||
@@ -39,7 +76,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
||||
|
||||
/// Given a cluster spec json and open transaction it handles roles creation,
|
||||
/// deletion and update.
|
||||
pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
|
||||
pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
let mut xact = client.transaction()?;
|
||||
let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
|
||||
|
||||
@@ -136,13 +173,20 @@ pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
}
|
||||
} else {
|
||||
info!("role name {}", &name);
|
||||
info!("role name: '{}'", &name);
|
||||
let mut query: String = format!("CREATE ROLE {} ", name.quote());
|
||||
info!("role create query {}", &query);
|
||||
info!("role create query: '{}'", &query);
|
||||
info_print!(" -> create");
|
||||
|
||||
query.push_str(&role.to_pg_options());
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
|
||||
let grant_query = format!(
|
||||
"grant pg_read_all_data, pg_write_all_data to {}",
|
||||
name.quote()
|
||||
);
|
||||
xact.execute(grant_query.as_str(), &[])?;
|
||||
info!("role grant query: '{}'", &grant_query);
|
||||
}
|
||||
|
||||
info_print!("\n");
|
||||
@@ -158,7 +202,7 @@ pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
|
||||
/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
|
||||
/// atomicity should be enough here due to the order of operations and various checks,
|
||||
/// which together provide us idempotency.
|
||||
pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
|
||||
pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
|
||||
|
||||
// Print a list of existing Postgres databases (only in debug mode)
|
||||
@@ -244,3 +288,24 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Grant CREATE ON DATABASE to the database owner
|
||||
// to allow clients create trusted extensions.
|
||||
pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
info!("cluster spec grants:");
|
||||
|
||||
for db in &spec.cluster.databases {
|
||||
let dbname = &db.name;
|
||||
|
||||
let query: String = format!(
|
||||
"GRANT CREATE ON DATABASE {} TO {}",
|
||||
dbname.quote(),
|
||||
db.owner.quote()
|
||||
);
|
||||
info!("grant query {}", &query);
|
||||
|
||||
client.execute(query.as_str(), &[])?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,109 +0,0 @@
|
||||
use std::process::{Command, Stdio};
|
||||
|
||||
use anyhow::Result;
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres::{Client, NoTls};
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::pg_helpers::*;
|
||||
|
||||
/// Compute node state shared across several `zenith_ctl` threads.
|
||||
/// Should be used under `RwLock` to allow HTTP API server to serve
|
||||
/// status requests, while configuration is in progress.
|
||||
pub struct ComputeState {
|
||||
pub connstr: String,
|
||||
pub pgdata: String,
|
||||
pub pgbin: String,
|
||||
pub spec: ClusterSpec,
|
||||
/// Compute setup process has finished
|
||||
pub ready: bool,
|
||||
/// Timestamp of the last Postgres activity
|
||||
pub last_active: DateTime<Utc>,
|
||||
}
|
||||
|
||||
/// Cluster spec or configuration represented as an optional number of
|
||||
/// delta operations + final cluster state description.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct ClusterSpec {
|
||||
pub format_version: f32,
|
||||
pub timestamp: String,
|
||||
pub operation_uuid: Option<String>,
|
||||
/// Expected cluster state at the end of transition process.
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
}
|
||||
|
||||
/// Cluster state seen from the perspective of the external tools
|
||||
/// like Rails web console.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct Cluster {
|
||||
pub cluster_id: String,
|
||||
pub name: String,
|
||||
pub state: Option<String>,
|
||||
pub roles: Vec<Role>,
|
||||
pub databases: Vec<Database>,
|
||||
pub settings: GenericOptions,
|
||||
}
|
||||
|
||||
/// Single cluster state changing operation that could not be represented as
|
||||
/// a static `Cluster` structure. For example:
|
||||
/// - DROP DATABASE
|
||||
/// - DROP ROLE
|
||||
/// - ALTER ROLE name RENAME TO new_name
|
||||
/// - ALTER DATABASE name RENAME TO new_name
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct DeltaOp {
|
||||
pub action: String,
|
||||
pub name: PgIdent,
|
||||
pub new_name: Option<PgIdent>,
|
||||
}
|
||||
|
||||
/// Get basebackup from the libpq connection to pageserver using `connstr` and
|
||||
/// unarchive it to `pgdata` directory overriding all its previous content.
|
||||
pub fn get_basebackup(
|
||||
pgdata: &str,
|
||||
connstr: &str,
|
||||
tenant: &str,
|
||||
timeline: &str,
|
||||
lsn: &str,
|
||||
) -> Result<()> {
|
||||
let mut client = Client::connect(connstr, NoTls)?;
|
||||
let basebackup_cmd = match lsn {
|
||||
"0/0" => format!("basebackup {} {}", tenant, timeline), // First start of the compute
|
||||
_ => format!("basebackup {} {} {}", tenant, timeline, lsn),
|
||||
};
|
||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||
let mut ar = tar::Archive::new(copyreader);
|
||||
|
||||
ar.unpack(&pgdata)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Run `postgres` in a special mode with `--sync-safekeepers` argument
|
||||
/// and return the reported LSN back to the caller.
|
||||
pub fn sync_safekeepers(pgdata: &str, pgbin: &str) -> Result<String> {
|
||||
let sync_handle = Command::new(&pgbin)
|
||||
.args(&["--sync-safekeepers"])
|
||||
.env("PGDATA", &pgdata) // we cannot use -D in this mode
|
||||
.stdout(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
|
||||
// `postgres --sync-safekeepers` will print all log output to stderr and
|
||||
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
|
||||
// redirected to the caller output.
|
||||
let sync_output = sync_handle
|
||||
.wait_with_output()
|
||||
.expect("postgres --sync-safekeepers failed");
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"postgres --sync-safekeepers exited with non-zero status: {}",
|
||||
sync_output.status,
|
||||
);
|
||||
}
|
||||
|
||||
let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
|
||||
|
||||
Ok(lsn)
|
||||
}
|
||||
@@ -4,12 +4,12 @@ mod pg_helpers_tests {
|
||||
use std::fs::File;
|
||||
|
||||
use compute_tools::pg_helpers::*;
|
||||
use compute_tools::zenith::ClusterSpec;
|
||||
use compute_tools::spec::ComputeSpec;
|
||||
|
||||
#[test]
|
||||
fn params_serialize() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let spec: ClusterSpec = serde_json::from_reader(file).unwrap();
|
||||
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
spec.cluster.databases.first().unwrap().to_pg_options(),
|
||||
@@ -24,7 +24,7 @@ mod pg_helpers_tests {
|
||||
#[test]
|
||||
fn settings_serialize() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let spec: ClusterSpec = serde_json::from_reader(file).unwrap();
|
||||
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
spec.cluster.settings.as_pg_settings(),
|
||||
|
||||
@@ -5,7 +5,7 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
tar = "0.4.33"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "1.12.0"
|
||||
toml = "0.5"
|
||||
@@ -18,6 +18,6 @@ url = "2.2.2"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
|
||||
pageserver = { path = "../pageserver" }
|
||||
walkeeper = { path = "../walkeeper" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
safekeeper = { path = "../safekeeper" }
|
||||
utils = { path = "../libs/utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -9,3 +9,6 @@ auth_type = 'Trust'
|
||||
id = 1
|
||||
pg_port = 5454
|
||||
http_port = 7676
|
||||
|
||||
[etcd_broker]
|
||||
broker_endpoints = ['http://127.0.0.1:2379']
|
||||
|
||||
@@ -11,11 +11,12 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use zenith_utils::connstring::connection_host_port;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
use utils::{
|
||||
connstring::connection_host_port,
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
@@ -272,12 +273,7 @@ impl PostgresNode {
|
||||
conf.append("wal_sender_timeout", "5s");
|
||||
conf.append("listen_addresses", &self.address.ip().to_string());
|
||||
conf.append("port", &self.address.port().to_string());
|
||||
|
||||
// Never clean up old WAL. TODO: We should use a replication
|
||||
// slot or something proper, to prevent the compute node
|
||||
// from removing WAL that hasn't been streamed to the safekeeper or
|
||||
// page server yet. (gh issue #349)
|
||||
conf.append("wal_keep_size", "10TB");
|
||||
conf.append("wal_keep_size", "0");
|
||||
|
||||
// Configure the node to fetch pages from pageserver
|
||||
let pageserver_connstr = {
|
||||
@@ -331,14 +327,14 @@ impl PostgresNode {
|
||||
// Configure the node to connect to the safekeepers
|
||||
conf.append("synchronous_standby_names", "walproposer");
|
||||
|
||||
let wal_acceptors = self
|
||||
let safekeepers = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.map(|sk| format!("localhost:{}", sk.pg_port))
|
||||
.collect::<Vec<String>>()
|
||||
.join(",");
|
||||
conf.append("wal_acceptors", &wal_acceptors);
|
||||
conf.append("wal_acceptors", &safekeepers);
|
||||
} else {
|
||||
// We only use setup without safekeepers for tests,
|
||||
// and don't care about data durability on pageserver,
|
||||
@@ -420,10 +416,15 @@ impl PostgresNode {
|
||||
if let Some(token) = auth_token {
|
||||
cmd.env("ZENITH_AUTH_TOKEN", token);
|
||||
}
|
||||
let pg_ctl = cmd.status().context("pg_ctl failed")?;
|
||||
|
||||
if !pg_ctl.success() {
|
||||
anyhow::bail!("pg_ctl failed");
|
||||
let pg_ctl = cmd.output().context("pg_ctl failed")?;
|
||||
if !pg_ctl.status.success() {
|
||||
anyhow::bail!(
|
||||
"pg_ctl failed, exit code: {}, stdout: {}, stderr: {}",
|
||||
pg_ctl.status,
|
||||
String::from_utf8_lossy(&pg_ctl.stdout),
|
||||
String::from_utf8_lossy(&pg_ctl.stderr),
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
93
control_plane/src/etcd.rs
Normal file
93
control_plane/src/etcd.rs
Normal file
@@ -0,0 +1,93 @@
|
||||
use std::{
|
||||
fs,
|
||||
path::PathBuf,
|
||||
process::{Command, Stdio},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use nix::{
|
||||
sys::signal::{kill, Signal},
|
||||
unistd::Pid,
|
||||
};
|
||||
|
||||
use crate::{local_env, read_pidfile};
|
||||
|
||||
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let etcd_broker = &env.etcd_broker;
|
||||
println!(
|
||||
"Starting etcd broker using {}",
|
||||
etcd_broker.etcd_binary_path.display()
|
||||
);
|
||||
|
||||
let etcd_data_dir = env.base_data_dir.join("etcd");
|
||||
fs::create_dir_all(&etcd_data_dir).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd data dir: {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let etcd_stdout_file =
|
||||
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create ectd stout file in directory {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
let etcd_stderr_file =
|
||||
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create ectd stderr file in directory {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
let client_urls = etcd_broker.comma_separated_endpoints();
|
||||
|
||||
let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
|
||||
.args(&[
|
||||
format!("--data-dir={}", etcd_data_dir.display()),
|
||||
format!("--listen-client-urls={client_urls}"),
|
||||
format!("--advertise-client-urls={client_urls}"),
|
||||
])
|
||||
.stdout(Stdio::from(etcd_stdout_file))
|
||||
.stderr(Stdio::from(etcd_stderr_file))
|
||||
.spawn()
|
||||
.context("Failed to spawn etcd subprocess")?;
|
||||
let pid = etcd_process.id();
|
||||
|
||||
let etcd_pid_file_path = etcd_pid_file_path(env);
|
||||
fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd pid file at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let etcd_path = &env.etcd_broker.etcd_binary_path;
|
||||
println!("Stopping etcd broker at {}", etcd_path.display());
|
||||
|
||||
let etcd_pid_file_path = etcd_pid_file_path(env);
|
||||
let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read etcd pid filea at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?);
|
||||
|
||||
kill(pid, Signal::SIGTERM).with_context(|| {
|
||||
format!(
|
||||
"Failed to stop etcd with pid {pid} at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
|
||||
env.base_data_dir.join("etcd.pid")
|
||||
}
|
||||
@@ -12,6 +12,7 @@ use std::path::Path;
|
||||
use std::process::Command;
|
||||
|
||||
pub mod compute;
|
||||
pub mod etcd;
|
||||
pub mod local_env;
|
||||
pub mod postgresql_conf;
|
||||
pub mod safekeeper;
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
//! script which will use local paths.
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::HashMap;
|
||||
@@ -11,9 +12,11 @@ use std::env;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
use utils::{
|
||||
auth::{encode_from_key_file, Claims, Scope},
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
@@ -57,6 +60,8 @@ pub struct LocalEnv {
|
||||
#[serde(default)]
|
||||
pub private_key_path: PathBuf,
|
||||
|
||||
pub etcd_broker: EtcdBroker,
|
||||
|
||||
pub pageserver: PageServerConf,
|
||||
|
||||
#[serde(default)]
|
||||
@@ -71,6 +76,62 @@ pub struct LocalEnv {
|
||||
branch_name_mappings: HashMap<String, Vec<(ZTenantId, ZTimelineId)>>,
|
||||
}
|
||||
|
||||
/// Etcd broker config for cluster internal communication.
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
pub struct EtcdBroker {
|
||||
/// A prefix to all to any key when pushing/polling etcd from a node.
|
||||
#[serde(default)]
|
||||
pub broker_etcd_prefix: Option<String>,
|
||||
|
||||
/// Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'.
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Vec<DisplayFromStr>")]
|
||||
pub broker_endpoints: Vec<Url>,
|
||||
|
||||
/// Etcd binary path to use.
|
||||
#[serde(default)]
|
||||
pub etcd_binary_path: PathBuf,
|
||||
}
|
||||
|
||||
impl EtcdBroker {
|
||||
pub fn locate_etcd() -> anyhow::Result<PathBuf> {
|
||||
let which_output = Command::new("which")
|
||||
.arg("etcd")
|
||||
.output()
|
||||
.context("Failed to run 'which etcd' command")?;
|
||||
let stdout = String::from_utf8_lossy(&which_output.stdout);
|
||||
ensure!(
|
||||
which_output.status.success(),
|
||||
"'which etcd' invocation failed. Status: {}, stdout: {stdout}, stderr: {}",
|
||||
which_output.status,
|
||||
String::from_utf8_lossy(&which_output.stderr)
|
||||
);
|
||||
|
||||
let etcd_path = PathBuf::from(stdout.trim());
|
||||
ensure!(
|
||||
etcd_path.is_file(),
|
||||
"'which etcd' invocation was successful, but the path it returned is not a file or does not exist: {}",
|
||||
etcd_path.display()
|
||||
);
|
||||
|
||||
Ok(etcd_path)
|
||||
}
|
||||
|
||||
pub fn comma_separated_endpoints(&self) -> String {
|
||||
self.broker_endpoints.iter().map(Url::as_str).fold(
|
||||
String::new(),
|
||||
|mut comma_separated_urls, url| {
|
||||
if !comma_separated_urls.is_empty() {
|
||||
comma_separated_urls.push(',');
|
||||
}
|
||||
comma_separated_urls.push_str(url);
|
||||
comma_separated_urls
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
#[serde(default)]
|
||||
pub struct PageServerConf {
|
||||
@@ -174,12 +235,7 @@ impl LocalEnv {
|
||||
if old_timeline_id == &timeline_id {
|
||||
Ok(())
|
||||
} else {
|
||||
bail!(
|
||||
"branch '{}' is already mapped to timeline {}, cannot map to another timeline {}",
|
||||
branch_name,
|
||||
old_timeline_id,
|
||||
timeline_id
|
||||
);
|
||||
bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}");
|
||||
}
|
||||
} else {
|
||||
existing_values.push((tenant_id, timeline_id));
|
||||
@@ -215,7 +271,7 @@ impl LocalEnv {
|
||||
///
|
||||
/// Unlike 'load_config', this function fills in any defaults that are missing
|
||||
/// from the config file.
|
||||
pub fn create_config(toml: &str) -> anyhow::Result<Self> {
|
||||
pub fn parse_config(toml: &str) -> anyhow::Result<Self> {
|
||||
let mut env: LocalEnv = toml::from_str(toml)?;
|
||||
|
||||
// Find postgres binaries.
|
||||
@@ -228,26 +284,11 @@ impl LocalEnv {
|
||||
env.pg_distrib_dir = cwd.join("tmp_install")
|
||||
}
|
||||
}
|
||||
if !env.pg_distrib_dir.join("bin/postgres").exists() {
|
||||
bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
env.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
// Find zenith binaries.
|
||||
if env.zenith_distrib_dir == Path::new("") {
|
||||
env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
}
|
||||
for binary in ["pageserver", "safekeeper"] {
|
||||
if !env.zenith_distrib_dir.join(binary).exists() {
|
||||
bail!(
|
||||
"Can't find binary '{}' in zenith distrib dir '{}'",
|
||||
binary,
|
||||
env.zenith_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// If no initial tenant ID was given, generate it.
|
||||
if env.default_tenant_id.is_none() {
|
||||
@@ -341,6 +382,36 @@ impl LocalEnv {
|
||||
"directory '{}' already exists. Perhaps already initialized?",
|
||||
base_path.display()
|
||||
);
|
||||
if !self.pg_distrib_dir.join("bin/postgres").exists() {
|
||||
bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
self.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
for binary in ["pageserver", "safekeeper"] {
|
||||
if !self.zenith_distrib_dir.join(binary).exists() {
|
||||
bail!(
|
||||
"Can't find binary '{}' in zenith distrib dir '{}'",
|
||||
binary,
|
||||
self.zenith_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
for binary in ["pageserver", "safekeeper"] {
|
||||
if !self.zenith_distrib_dir.join(binary).exists() {
|
||||
bail!(
|
||||
"Can't find binary '{binary}' in zenith distrib dir '{}'",
|
||||
self.zenith_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
if !self.pg_distrib_dir.join("bin/postgres").exists() {
|
||||
bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
self.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
fs::create_dir(&base_path)?;
|
||||
|
||||
@@ -398,7 +469,35 @@ impl LocalEnv {
|
||||
|
||||
fn base_path() -> PathBuf {
|
||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
||||
Some(val) => PathBuf::from(val.to_str().unwrap()),
|
||||
None => ".zenith".into(),
|
||||
Some(val) => PathBuf::from(val),
|
||||
None => PathBuf::from(".zenith"),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn simple_conf_parsing() {
|
||||
let simple_conf_toml = include_str!("../simple.conf");
|
||||
let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
|
||||
assert!(
|
||||
simple_conf_parse_result.is_ok(),
|
||||
"failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
|
||||
);
|
||||
|
||||
let string_to_replace = "broker_endpoints = ['http://127.0.0.1:2379']";
|
||||
let spoiled_url_str = "broker_endpoints = ['!@$XOXO%^&']";
|
||||
let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
|
||||
assert!(
|
||||
spoiled_url_toml.contains(spoiled_url_str),
|
||||
"Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
|
||||
);
|
||||
let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
|
||||
assert!(
|
||||
spoiled_url_parse_result.is_err(),
|
||||
"expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,15 +13,17 @@ use nix::unistd::Pid;
|
||||
use postgres::Config;
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use safekeeper::http::models::TimelineCreateRequest;
|
||||
use thiserror::Error;
|
||||
use walkeeper::http::models::TimelineCreateRequest;
|
||||
use zenith_utils::http::error::HttpErrorBody;
|
||||
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
http::error::HttpErrorBody,
|
||||
zid::{ZNodeId, ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::local_env::{LocalEnv, SafekeeperConf};
|
||||
use crate::storage::PageServerNode;
|
||||
use crate::{fill_rust_env_vars, read_pidfile};
|
||||
use zenith_utils::connstring::connection_address;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum SafekeeperHttpError {
|
||||
@@ -50,7 +52,7 @@ impl ResponseErrorMessageExt for Response {
|
||||
Err(SafekeeperHttpError::Response(
|
||||
match self.json::<HttpErrorBody>() {
|
||||
Ok(err_body) => format!("Error: {}", err_body.msg),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
Err(_) => format!("Http error ({}) at {url}.", status.as_u16()),
|
||||
},
|
||||
))
|
||||
}
|
||||
@@ -79,8 +81,6 @@ impl SafekeeperNode {
|
||||
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
||||
let pageserver = Arc::new(PageServerNode::from_env(env));
|
||||
|
||||
println!("initializing for sk {} for {}", conf.id, conf.http_port);
|
||||
|
||||
SafekeeperNode {
|
||||
id: conf.id,
|
||||
conf: conf.clone(),
|
||||
@@ -136,6 +136,14 @@ impl SafekeeperNode {
|
||||
cmd.arg("--no-sync");
|
||||
}
|
||||
|
||||
let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
|
||||
if !comma_separated_endpoints.is_empty() {
|
||||
cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
|
||||
}
|
||||
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
|
||||
cmd.args(&["--broker-etcd-prefix", prefix]);
|
||||
}
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Safekeeper failed to start. See '{}' for details.",
|
||||
@@ -197,12 +205,13 @@ impl SafekeeperNode {
|
||||
let pid = Pid::from_raw(pid);
|
||||
|
||||
let sig = if immediate {
|
||||
println!("Stop safekeeper immediately");
|
||||
print!("Stopping safekeeper {} immediately..", self.id);
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
println!("Stop safekeeper gracefully");
|
||||
print!("Stopping safekeeper {} gracefully..", self.id);
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
@@ -224,25 +233,35 @@ impl SafekeeperNode {
|
||||
// TODO Remove this "timeout" and handle it on caller side instead.
|
||||
// Shutting down may take a long time,
|
||||
// if safekeeper flushes a lot of data
|
||||
let mut tcp_stopped = false;
|
||||
for _ in 0..100 {
|
||||
if let Err(_e) = TcpStream::connect(&address) {
|
||||
println!("Safekeeper stopped receiving connections");
|
||||
|
||||
//Now check status
|
||||
match self.check_status() {
|
||||
Ok(_) => {
|
||||
println!("Safekeeper status is OK. Wait a bit.");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
Err(err) => {
|
||||
println!("Safekeeper status is: {}", err);
|
||||
return Ok(());
|
||||
if !tcp_stopped {
|
||||
if let Err(err) = TcpStream::connect(&address) {
|
||||
tcp_stopped = true;
|
||||
if err.kind() != io::ErrorKind::ConnectionRefused {
|
||||
eprintln!("\nSafekeeper connection failed with error: {err}");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("Safekeeper still receives connections");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
if tcp_stopped {
|
||||
// Also check status on the HTTP port
|
||||
match self.check_status() {
|
||||
Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => {
|
||||
println!("done!");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => {
|
||||
eprintln!("\nSafekeeper status check failed with error: {err}");
|
||||
return Ok(());
|
||||
}
|
||||
Ok(()) => {
|
||||
// keep waiting
|
||||
}
|
||||
}
|
||||
}
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
|
||||
bail!("Failed to stop safekeeper with pid {}", pid);
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::net::TcpStream;
|
||||
use std::path::PathBuf;
|
||||
@@ -9,21 +10,23 @@ use anyhow::{bail, Context};
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest};
|
||||
use pageserver::http::models::{TenantConfigRequest, TenantCreateRequest, TimelineCreateRequest};
|
||||
use pageserver::timelines::TimelineInfo;
|
||||
use postgres::{Config, NoTls};
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use zenith_utils::http::error::HttpErrorBody;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
http::error::HttpErrorBody,
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::{fill_rust_env_vars, read_pidfile};
|
||||
use pageserver::tenant_mgr::TenantInfo;
|
||||
use zenith_utils::connstring::connection_address;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum PageserverHttpError {
|
||||
@@ -118,6 +121,16 @@ impl PageServerNode {
|
||||
);
|
||||
let listen_pg_addr_param =
|
||||
format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
|
||||
let broker_endpoints_param = format!(
|
||||
"broker_endpoints=[{}]",
|
||||
self.env
|
||||
.etcd_broker
|
||||
.broker_endpoints
|
||||
.iter()
|
||||
.map(|url| format!("'{url}'"))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
);
|
||||
let mut args = Vec::with_capacity(20);
|
||||
|
||||
args.push("--init");
|
||||
@@ -126,8 +139,19 @@ impl PageServerNode {
|
||||
args.extend(["-c", &authg_type_param]);
|
||||
args.extend(["-c", &listen_http_addr_param]);
|
||||
args.extend(["-c", &listen_pg_addr_param]);
|
||||
args.extend(["-c", &broker_endpoints_param]);
|
||||
args.extend(["-c", &id]);
|
||||
|
||||
let broker_etcd_prefix_param = self
|
||||
.env
|
||||
.etcd_broker
|
||||
.broker_etcd_prefix
|
||||
.as_ref()
|
||||
.map(|prefix| format!("broker_etcd_prefix='{prefix}'"));
|
||||
if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() {
|
||||
args.extend(["-c", broker_etcd_prefix_param]);
|
||||
}
|
||||
|
||||
for config_override in config_overrides {
|
||||
args.extend(["-c", config_override]);
|
||||
}
|
||||
@@ -148,14 +172,25 @@ impl PageServerNode {
|
||||
let initial_timeline_id_string = initial_timeline_id.to_string();
|
||||
args.extend(["--initial-timeline-id", &initial_timeline_id_string]);
|
||||
|
||||
let init_output = fill_rust_env_vars(cmd.args(args))
|
||||
let cmd_with_args = cmd.args(args);
|
||||
let init_output = fill_rust_env_vars(cmd_with_args)
|
||||
.output()
|
||||
.context("pageserver init failed")?;
|
||||
.with_context(|| {
|
||||
format!("failed to init pageserver with command {:?}", cmd_with_args)
|
||||
})?;
|
||||
|
||||
if !init_output.status.success() {
|
||||
bail!("pageserver init failed");
|
||||
bail!(
|
||||
"init invocation failed, {}\nStdout: {}\nStderr: {}",
|
||||
init_output.status,
|
||||
String::from_utf8_lossy(&init_output.stdout),
|
||||
String::from_utf8_lossy(&init_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
// echo the captured output of the init command
|
||||
println!("{}", String::from_utf8_lossy(&init_output.stdout));
|
||||
|
||||
Ok(initial_timeline_id)
|
||||
}
|
||||
|
||||
@@ -175,8 +210,6 @@ impl PageServerNode {
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
|
||||
let repo_path = self.repo_path();
|
||||
let mut args = vec!["-D", repo_path.to_str().unwrap()];
|
||||
|
||||
@@ -184,9 +217,11 @@ impl PageServerNode {
|
||||
args.extend(["-c", config_override]);
|
||||
}
|
||||
|
||||
fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
|
||||
filled_cmd = fill_aws_secrets_vars(filled_cmd);
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
if !filled_cmd.status()?.success() {
|
||||
bail!(
|
||||
"Pageserver failed to start. See '{}' for details.",
|
||||
self.repo_path().join("pageserver.log").display()
|
||||
@@ -246,12 +281,13 @@ impl PageServerNode {
|
||||
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
|
||||
|
||||
let sig = if immediate {
|
||||
println!("Stop pageserver immediately");
|
||||
print!("Stopping pageserver immediately..");
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
println!("Stop pageserver gracefully");
|
||||
print!("Stopping pageserver gracefully..");
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
@@ -273,25 +309,36 @@ impl PageServerNode {
|
||||
// TODO Remove this "timeout" and handle it on caller side instead.
|
||||
// Shutting down may take a long time,
|
||||
// if pageserver checkpoints a lot of data
|
||||
let mut tcp_stopped = false;
|
||||
for _ in 0..100 {
|
||||
if let Err(_e) = TcpStream::connect(&address) {
|
||||
println!("Pageserver stopped receiving connections");
|
||||
|
||||
//Now check status
|
||||
match self.check_status() {
|
||||
Ok(_) => {
|
||||
println!("Pageserver status is OK. Wait a bit.");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
Err(err) => {
|
||||
println!("Pageserver status is: {}", err);
|
||||
return Ok(());
|
||||
if !tcp_stopped {
|
||||
if let Err(err) = TcpStream::connect(&address) {
|
||||
tcp_stopped = true;
|
||||
if err.kind() != io::ErrorKind::ConnectionRefused {
|
||||
eprintln!("\nPageserver connection failed with error: {err}");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("Pageserver still receives connections");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
if tcp_stopped {
|
||||
// Also check status on the HTTP port
|
||||
|
||||
match self.check_status() {
|
||||
Err(PageserverHttpError::Transport(err)) if err.is_connect() => {
|
||||
println!("done!");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => {
|
||||
eprintln!("\nPageserver status check failed with error: {err}");
|
||||
return Ok(());
|
||||
}
|
||||
Ok(()) => {
|
||||
// keep waiting
|
||||
}
|
||||
}
|
||||
}
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
|
||||
bail!("Failed to stop pageserver with pid {}", pid);
|
||||
@@ -334,10 +381,36 @@ impl PageServerNode {
|
||||
pub fn tenant_create(
|
||||
&self,
|
||||
new_tenant_id: Option<ZTenantId>,
|
||||
settings: HashMap<&str, &str>,
|
||||
) -> anyhow::Result<Option<ZTenantId>> {
|
||||
let tenant_id_string = self
|
||||
.http_request(Method::POST, format!("{}/tenant", self.http_base_url))
|
||||
.json(&TenantCreateRequest { new_tenant_id })
|
||||
.json(&TenantCreateRequest {
|
||||
new_tenant_id,
|
||||
checkpoint_distance: settings
|
||||
.get("checkpoint_distance")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()?,
|
||||
compaction_target_size: settings
|
||||
.get("compaction_target_size")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()?,
|
||||
compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
|
||||
compaction_threshold: settings
|
||||
.get("compaction_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()?,
|
||||
gc_horizon: settings
|
||||
.get("gc_horizon")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()?,
|
||||
gc_period: settings.get("gc_period").map(|x| x.to_string()),
|
||||
image_creation_threshold: settings
|
||||
.get("image_creation_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()?,
|
||||
pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json::<Option<String>>()?;
|
||||
@@ -354,6 +427,35 @@ impl PageServerNode {
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> {
|
||||
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))
|
||||
.json(&TenantConfigRequest {
|
||||
tenant_id,
|
||||
checkpoint_distance: settings
|
||||
.get("checkpoint_distance")
|
||||
.map(|x| x.parse::<u64>().unwrap()),
|
||||
compaction_target_size: settings
|
||||
.get("compaction_target_size")
|
||||
.map(|x| x.parse::<u64>().unwrap()),
|
||||
compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
|
||||
compaction_threshold: settings
|
||||
.get("compaction_threshold")
|
||||
.map(|x| x.parse::<usize>().unwrap()),
|
||||
gc_horizon: settings
|
||||
.get("gc_horizon")
|
||||
.map(|x| x.parse::<u64>().unwrap()),
|
||||
gc_period: settings.get("gc_period").map(|x| x.to_string()),
|
||||
image_creation_threshold: settings
|
||||
.get("image_creation_threshold")
|
||||
.map(|x| x.parse::<usize>().unwrap()),
|
||||
pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result<Vec<TimelineInfo>> {
|
||||
let timeline_infos: Vec<TimelineInfo> = self
|
||||
.http_request(
|
||||
@@ -391,3 +493,12 @@ impl PageServerNode {
|
||||
Ok(timeline_info_response)
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] {
|
||||
if let Ok(value) = std::env::var(env_key) {
|
||||
cmd = cmd.env(env_key, value);
|
||||
}
|
||||
}
|
||||
cmd
|
||||
}
|
||||
|
||||
@@ -1,13 +1,20 @@
|
||||
#!/bin/sh
|
||||
set -eux
|
||||
|
||||
broker_endpoints_param="${BROKER_ENDPOINT:-absent}"
|
||||
if [ "$broker_endpoints_param" != "absent" ]; then
|
||||
broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']"
|
||||
else
|
||||
broker_endpoints_param=''
|
||||
fi
|
||||
|
||||
if [ "$1" = 'pageserver' ]; then
|
||||
if [ ! -d "/data/tenants" ]; then
|
||||
echo "Initializing pageserver data directory"
|
||||
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10"
|
||||
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" $broker_endpoints_param
|
||||
fi
|
||||
echo "Staring pageserver at 0.0.0.0:6400"
|
||||
pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data
|
||||
pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data
|
||||
else
|
||||
"$@"
|
||||
fi
|
||||
|
||||
@@ -7,8 +7,8 @@
|
||||
- [glossary.md](glossary.md) — Glossary of all the terms used in codebase.
|
||||
- [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
|
||||
- [sourcetree.md](sourcetree.md) — Overview of the source tree layeout.
|
||||
- [pageserver/README](/pageserver/README) — pageserver overview.
|
||||
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
|
||||
- [pageserver/README.md](/pageserver/README.md) — pageserver overview.
|
||||
- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview.
|
||||
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
|
||||
- [walkeeper/README](/walkeeper/README) — WAL service overview.
|
||||
- [safekeeper/README.md](/safekeeper/README.md) — WAL service overview.
|
||||
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
|
||||
|
||||
@@ -27,4 +27,4 @@ management_token = jwt.encode({"scope": "pageserverapi"}, auth_keys.priv, algori
|
||||
tenant_token = jwt.encode({"scope": "tenant", "tenant_id": ps.initial_tenant}, auth_keys.priv, algorithm="RS256")
|
||||
```
|
||||
|
||||
Utility functions to work with jwts in rust are located in zenith_utils/src/auth.rs
|
||||
Utility functions to work with jwts in rust are located in libs/utils/src/auth.rs
|
||||
|
||||
@@ -1,20 +1,20 @@
|
||||
# Docker images of Zenith
|
||||
# Docker images of Neon
|
||||
|
||||
## Images
|
||||
|
||||
Currently we build two main images:
|
||||
|
||||
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).
|
||||
- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres).
|
||||
|
||||
And additional intermediate images:
|
||||
And additional intermediate image:
|
||||
|
||||
- [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools.
|
||||
- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.
|
||||
|
||||
## Building pipeline
|
||||
|
||||
1. Image `zenithdb/compute-tools` is re-built automatically.
|
||||
We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs
|
||||
|
||||
2. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo.
|
||||
1. `neondatabase/compute-tools` and `neondatabase/compute-node`
|
||||
|
||||
3. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically.
|
||||
2. `neondatabase/neon`
|
||||
|
||||
@@ -21,7 +21,7 @@ NOTE:It has nothing to do with PostgreSQL pg_basebackup.
|
||||
|
||||
### Branch
|
||||
|
||||
We can create branch at certain LSN using `zenith branch` command.
|
||||
We can create branch at certain LSN using `zenith timeline branch` command.
|
||||
Each Branch lives in a corresponding timeline[] and has an ancestor[].
|
||||
|
||||
|
||||
@@ -29,24 +29,32 @@ Each Branch lives in a corresponding timeline[] and has an ancestor[].
|
||||
|
||||
NOTE: This is an overloaded term.
|
||||
|
||||
A checkpoint record in the WAL marks a point in the WAL sequence at which it is guaranteed that all data files have been updated with all information from shared memory modified before that checkpoint;
|
||||
A checkpoint record in the WAL marks a point in the WAL sequence at which it is guaranteed that all data files have been updated with all information from shared memory modified before that checkpoint;
|
||||
|
||||
### Checkpoint (Layered repository)
|
||||
|
||||
NOTE: This is an overloaded term.
|
||||
|
||||
Whenever enough WAL has been accumulated in memory, the page server []
|
||||
writes out the changes from in-memory layers into new layer files[]. This process
|
||||
is called "checkpointing". The page server only creates layer files for
|
||||
relations that have been modified since the last checkpoint.
|
||||
writes out the changes from the in-memory layer into a new delta layer file. This process
|
||||
is called "checkpointing".
|
||||
|
||||
Configuration parameter `checkpoint_distance` defines the distance
|
||||
from current LSN to perform checkpoint of in-memory layers.
|
||||
Default is `DEFAULT_CHECKPOINT_DISTANCE`.
|
||||
Set this parameter to `0` to force checkpoint of every layer.
|
||||
|
||||
Configuration parameter `checkpoint_period` defines the interval between checkpoint iterations.
|
||||
Default is `DEFAULT_CHECKPOINT_PERIOD`.
|
||||
### Compaction
|
||||
|
||||
A background operation on layer files. Compaction takes a number of L0
|
||||
layer files, each of which covers the whole key space and a range of
|
||||
LSN, and reshuffles the data in them into L1 files so that each file
|
||||
covers the whole LSN range, but only part of the key space.
|
||||
|
||||
Compaction should also opportunistically leave obsolete page versions
|
||||
from the L1 files, and materialize other page versions for faster
|
||||
access. That hasn't been implemented as of this writing, though.
|
||||
|
||||
|
||||
### Compute node
|
||||
|
||||
Stateless Postgres node that stores data in pageserver.
|
||||
@@ -54,10 +62,10 @@ Stateless Postgres node that stores data in pageserver.
|
||||
### Garbage collection
|
||||
|
||||
The process of removing old on-disk layers that are not needed by any timeline anymore.
|
||||
|
||||
### Fork
|
||||
|
||||
Each of the separate segmented file sets in which a relation is stored. The main fork is where the actual data resides. There also exist two secondary forks for metadata: the free space map and the visibility map.
|
||||
Each PostgreSQL fork is considered a separate relish.
|
||||
|
||||
### Layer
|
||||
|
||||
@@ -72,15 +80,15 @@ are immutable. See pageserver/src/layered_repository/README.md for more.
|
||||
### Layer file (on-disk layer)
|
||||
|
||||
Layered repository on-disk format is based on immutable files. The
|
||||
files are called "layer files". Each file corresponds to one RELISH_SEG_SIZE
|
||||
segment of a PostgreSQL relation fork. There are two kinds of layer
|
||||
files: image files and delta files. An image file contains a
|
||||
"snapshot" of the segment at a particular LSN, and a delta file
|
||||
contains WAL records applicable to the segment, in a range of LSNs.
|
||||
files are called "layer files". There are two kinds of layer files:
|
||||
image files and delta files. An image file contains a "snapshot" of a
|
||||
range of keys at a particular LSN, and a delta file contains WAL
|
||||
records applicable to a range of keys, in a range of LSNs.
|
||||
|
||||
### Layer map
|
||||
|
||||
The layer map tracks what layers exist for all the relishes in a timeline.
|
||||
The layer map tracks what layers exist in a timeline.
|
||||
|
||||
### Layered repository
|
||||
|
||||
Zenith repository implementation that keeps data in layers.
|
||||
@@ -100,10 +108,10 @@ PostgreSQL LSNs and functions to monitor them:
|
||||
* `pg_current_wal_lsn()` - Returns the current write-ahead log write location.
|
||||
* `pg_current_wal_flush_lsn()` - Returns the current write-ahead log flush location.
|
||||
* `pg_last_wal_receive_lsn()` - Returns the last write-ahead log location that has been received and synced to disk by streaming replication. While streaming replication is in progress this will increase monotonically.
|
||||
* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
|
||||
* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
|
||||
[source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html):
|
||||
|
||||
Zenith safekeeper LSNs. For more check [walkeeper/README_PROTO.md](/walkeeper/README_PROTO.md)
|
||||
Zenith safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
|
||||
* `CommitLSN`: position in WAL confirmed by quorum safekeepers.
|
||||
* `RestartLSN`: position in WAL confirmed by all safekeepers.
|
||||
* `FlushLSN`: part of WAL persisted to the disk by safekeeper.
|
||||
@@ -149,14 +157,6 @@ and create new databases and accounts (control plane API in our case).
|
||||
|
||||
The generic term in PostgreSQL for all objects in a database that have a name and a list of attributes defined in a specific order.
|
||||
|
||||
### Relish
|
||||
|
||||
We call each relation and other file that is stored in the
|
||||
repository a "relish". It comes from "rel"-ish, as in "kind of a
|
||||
rel", because it covers relations as well as other things that are
|
||||
not relations, but are treated similarly for the purposes of the
|
||||
storage layer.
|
||||
|
||||
### Replication slot
|
||||
|
||||
|
||||
@@ -173,33 +173,24 @@ One repository corresponds to one Tenant.
|
||||
|
||||
How much history do we need to keep around for PITR and read-only nodes?
|
||||
|
||||
### Segment (PostgreSQL)
|
||||
|
||||
NOTE: This is an overloaded term.
|
||||
### Segment
|
||||
|
||||
A physical file that stores data for a given relation. File segments are
|
||||
limited in size by a compile-time setting (1 gigabyte by default), so if a
|
||||
relation exceeds that size, it is split into multiple segments.
|
||||
|
||||
### Segment (Layered Repository)
|
||||
|
||||
NOTE: This is an overloaded term.
|
||||
|
||||
Segment is a RELISH_SEG_SIZE slice of relish (identified by a SegmentTag).
|
||||
|
||||
### SLRU
|
||||
|
||||
SLRUs include pg_clog, pg_multixact/members, and
|
||||
pg_multixact/offsets. There are other SLRUs in PostgreSQL, but
|
||||
they don't need to be stored permanently (e.g. pg_subtrans),
|
||||
or we do not support them in zenith yet (pg_commit_ts).
|
||||
Each SLRU segment is considered a separate relish[].
|
||||
|
||||
### Tenant (Multitenancy)
|
||||
Tenant represents a single customer, interacting with Zenith.
|
||||
Wal redo[] activity, timelines[], layers[] are managed for each tenant independently.
|
||||
One pageserver[] can serve multiple tenants at once.
|
||||
One safekeeper
|
||||
One safekeeper
|
||||
|
||||
See `docs/multitenancy.md` for more.
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ In addition to the WAL safekeeper nodes, the WAL is archived in
|
||||
S3. WAL that has been archived to S3 can be removed from the
|
||||
safekeepers, so the safekeepers don't need a lot of disk space.
|
||||
|
||||
|
||||
```
|
||||
+----------------+
|
||||
+-----> | WAL safekeeper |
|
||||
| +----------------+
|
||||
@@ -42,23 +42,23 @@ safekeepers, so the safekeepers don't need a lot of disk space.
|
||||
\
|
||||
\
|
||||
\
|
||||
\ +--------+
|
||||
\ | |
|
||||
+--> | S3 |
|
||||
| |
|
||||
+--------+
|
||||
|
||||
\ +--------+
|
||||
\ | |
|
||||
+------> | S3 |
|
||||
| |
|
||||
+--------+
|
||||
|
||||
```
|
||||
Every WAL safekeeper holds a section of WAL, and a VCL value.
|
||||
The WAL can be divided into three portions:
|
||||
|
||||
|
||||
```
|
||||
VCL LSN
|
||||
| |
|
||||
V V
|
||||
.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
|
||||
Archived WAL Completed WAL In-flight WAL
|
||||
|
||||
```
|
||||
|
||||
Note that all this WAL kept in a safekeeper is a contiguous section.
|
||||
This is different from Aurora: In Aurora, there can be holes in the
|
||||
|
||||
@@ -12,7 +12,7 @@ Init empty pageserver using `initdb` in temporary directory.
|
||||
|
||||
`--storage_dest=FILE_PREFIX | S3_PREFIX |...` option defines object storage type, all other parameters are passed via env variables. Inspired by WAL-G style naming : https://wal-g.readthedocs.io/STORAGES/.
|
||||
|
||||
Save`storage_dest` and other parameters in config.
|
||||
Save`storage_dest` and other parameters in config.
|
||||
Push snapshots to `storage_dest` in background.
|
||||
|
||||
```
|
||||
@@ -21,7 +21,7 @@ zenith start
|
||||
```
|
||||
|
||||
#### 2. Restart pageserver (manually or crash-recovery).
|
||||
Take `storage_dest` from pageserver config, start pageserver from latest snapshot in `storage_dest`.
|
||||
Take `storage_dest` from pageserver config, start pageserver from latest snapshot in `storage_dest`.
|
||||
Push snapshots to `storage_dest` in background.
|
||||
|
||||
```
|
||||
@@ -32,7 +32,7 @@ zenith start
|
||||
Start pageserver from existing snapshot.
|
||||
Path to snapshot provided via `--snapshot_path=FILE_PREFIX | S3_PREFIX | ...`
|
||||
Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time operation.
|
||||
Save`storage_dest` parameters in config.
|
||||
Save`storage_dest` parameters in config.
|
||||
Push snapshots to `storage_dest` in background.
|
||||
```
|
||||
//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage.
|
||||
@@ -42,15 +42,15 @@ zenith start
|
||||
How to pass credentials needed for `snapshot_path`?
|
||||
|
||||
#### 4. Export.
|
||||
Manually push snapshot to `snapshot_path` which differs from `storage_dest`
|
||||
Manually push snapshot to `snapshot_path` which differs from `storage_dest`
|
||||
Optionally set `snapshot_format`, which can be plain pgdata format or zenith format.
|
||||
```
|
||||
zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
|
||||
```
|
||||
|
||||
#### Notes and questions
|
||||
- walkeeper s3_offload should use same (similar) syntax for storage. How to set it in UI?
|
||||
- safekeeper s3_offload should use same (similar) syntax for storage. How to set it in UI?
|
||||
- Why do we need `zenith init` as a separate command? Can't we init everything at first start?
|
||||
- We can think of better names for all options.
|
||||
- Export to plain postgres format will be useless, if we are not 100% compatible on page level.
|
||||
I can recall at least one such difference - PD_WAL_LOGGED flag in pages.
|
||||
I can recall at least one such difference - PD_WAL_LOGGED flag in pages.
|
||||
|
||||
69
docs/rfcs/014-safekeepers-gossip.md
Normal file
69
docs/rfcs/014-safekeepers-gossip.md
Normal file
@@ -0,0 +1,69 @@
|
||||
# Safekeeper gossip
|
||||
|
||||
Extracted from this [PR](https://github.com/zenithdb/rfcs/pull/13)
|
||||
|
||||
## Motivation
|
||||
|
||||
In some situations, safekeeper (SK) needs coordination with other SK's that serve the same tenant:
|
||||
|
||||
1. WAL deletion. SK needs to know what WAL was already safely replicated to delete it. Now we keep WAL indefinitely.
|
||||
2. Deciding on who is sending WAL to the pageserver. Now sending SK crash may lead to a livelock where nobody sends WAL to the pageserver.
|
||||
3. To enable SK to SK direct recovery without involving the compute
|
||||
|
||||
## Summary
|
||||
|
||||
Compute node has connection strings to each safekeeper. During each compute->safekeeper connection establishment, the compute node should pass down all that connection strings to each safekeeper. With that info, safekeepers may establish Postgres connections to each other and periodically send ping messages with LSN payload.
|
||||
|
||||
## Components
|
||||
|
||||
safekeeper, compute, compute<->safekeeper protocol, possibly console (group SK addresses)
|
||||
|
||||
## Proposed implementation
|
||||
|
||||
Each safekeeper can periodically ping all its peers and share connectivity and liveness info. If the ping was not receiver for, let's say, four ping periods, we may consider sending safekeeper as dead. That would mean some of the alive safekeepers should connect to the pageserver. One way to decide which one exactly: `make_connection = my_node_id == min(alive_nodes)`
|
||||
|
||||
Since safekeepers are multi-tenant, we may establish either per-tenant physical connections or per-safekeeper ones. So it makes sense to group "logical" connections between corresponding tenants on different nodes into a single physical connection. That means that we should implement an interconnect thread that maintains physical connections and periodically broadcasts info about all tenants.
|
||||
|
||||
Right now console may assign any 3 SK addresses to a given compute node. That may lead to a high number of gossip connections between SK's. Instead, we can assign safekeeper triples to the compute node. But if we want to "break"/" change" group by an ad-hoc action, we can do it.
|
||||
|
||||
### Corner cases
|
||||
|
||||
- Current safekeeper may be alive but may not have connectivity to the pageserver
|
||||
|
||||
To address that, we need to gossip visibility info. Based on that info, we may define SK as alive only when it can connect to the pageserver.
|
||||
|
||||
- Current safekeeper may be alive but may not have connectivity with the compute node.
|
||||
|
||||
We may broadcast last_received_lsn and presence of compute connection and decide who is alive based on that.
|
||||
|
||||
- It is tricky to decide when to shut down gossip connections because we need to be sure that pageserver got all the committed (in the distributed sense, so local SK info is not enough) records, and it may never lose them. It is not a strict requirement since `--sync-safekeepers` that happen before the compute start will allow the pageserver to consume missing WAL, but it is better to do that in the background. So the condition may look like that: `majority_max(flush_lsn) == pageserver_s3_lsn` Here we rely on the two facts:
|
||||
- that `--sync-safekeepers` happened after the compute shutdown, and it advanced local commit_lsn's allowing pageserver to consume that WAL.
|
||||
|
||||
- we wait for the `pageserver_s3_lsn` advancement to avoid pageserver's last_received_lsn/disk_consistent_lsn going backward due to the disk/hardware failure and subsequent S3 recovery
|
||||
|
||||
If those conditions are not met, we will have some gossip activity (but that may be okay).
|
||||
|
||||
## Pros/cons
|
||||
|
||||
Pros:
|
||||
|
||||
- distributed, does not introduce new services (like etcd), does not add console as a storage dependency
|
||||
- lays the foundation for gossip-based recovery
|
||||
|
||||
Cons:
|
||||
|
||||
- Only compute knows a set of safekeepers, but they should communicate even without compute node. In case of safekeepers restart, we will lose that info and can't gossip anymore. Hence we can't trim some WAL tail until the compute node start. Also, it is ugly.
|
||||
|
||||
- If the console assigns a random set of safekeepers to each Postgres, we may end up in a situation where each safekeeper needs to have a connection with all other safekeepers. We can group safekeepers into isolated triples in the console to avoid that. Then "mixing" would happen only if we do rebalancing.
|
||||
|
||||
## Alternative implementation
|
||||
|
||||
We can have a selected node (e.g., console) with everybody reporting to it.
|
||||
|
||||
## Security implications
|
||||
|
||||
We don't increase the attack surface here. Communication can happen in a private network that is not exposed to users.
|
||||
|
||||
## Scalability implications
|
||||
|
||||
The only thing that may grow as we grow the number of computes is the number of gossip connections. But if we group safekeepers and assign a compute node to the random SK triple, the number of connections would be constant.
|
||||
145
docs/rfcs/014-storage-lsm.md
Normal file
145
docs/rfcs/014-storage-lsm.md
Normal file
@@ -0,0 +1,145 @@
|
||||
# Why LSM trees?
|
||||
|
||||
In general, an LSM tree has the nice property that random updates are
|
||||
fast, but the disk writes are sequential. When a new file is created,
|
||||
it is immutable. New files are created and old ones are deleted, but
|
||||
existing files are never modified. That fits well with storing the
|
||||
files on S3.
|
||||
|
||||
Currently, we create a lot of small files. That is mostly a problem
|
||||
with S3, because each GET/PUT operation is expensive, and LIST
|
||||
operation only returns 1000 objects at a time, and isn't free
|
||||
either. Currently, the files are "archived" together into larger
|
||||
checkpoint files before they're uploaded to S3 to alleviate that
|
||||
problem, but garbage collecting data from the archive files would be
|
||||
difficult and we have not implemented it. This proposal addresses that
|
||||
problem.
|
||||
|
||||
|
||||
# Overview
|
||||
|
||||
|
||||
```
|
||||
^ LSN
|
||||
|
|
||||
| Memtable: +-----------------------------+
|
||||
| | |
|
||||
| +-----------------------------+
|
||||
|
|
||||
|
|
||||
| L0: +-----------------------------+
|
||||
| | |
|
||||
| +-----------------------------+
|
||||
|
|
||||
| +-----------------------------+
|
||||
| | |
|
||||
| +-----------------------------+
|
||||
|
|
||||
| +-----------------------------+
|
||||
| | |
|
||||
| +-----------------------------+
|
||||
|
|
||||
| +-----------------------------+
|
||||
| | |
|
||||
| +-----------------------------+
|
||||
|
|
||||
|
|
||||
| L1: +-------+ +-----+ +--+ +-+
|
||||
| | | | | | | | |
|
||||
| | | | | | | | |
|
||||
| +-------+ +-----+ +--+ +-+
|
||||
|
|
||||
| +----+ +-----+ +--+ +----+
|
||||
| | | | | | | | |
|
||||
| | | | | | | | |
|
||||
| +----+ +-----+ +--+ +----+
|
||||
|
|
||||
+--------------------------------------------------------------> Page ID
|
||||
|
||||
|
||||
+---+
|
||||
| | Layer file
|
||||
+---+
|
||||
```
|
||||
|
||||
|
||||
# Memtable
|
||||
|
||||
When new WAL arrives, it is first put into the Memtable. Despite the
|
||||
name, the Memtable is not a purely in-memory data structure. It can
|
||||
spill to a temporary file on disk if the system is low on memory, and
|
||||
is accessed through a buffer cache.
|
||||
|
||||
If the page server crashes, the Memtable is lost. It is rebuilt by
|
||||
processing again the WAL that's newer than the latest layer in L0.
|
||||
|
||||
The size of the Memtable is configured by the "checkpoint distance"
|
||||
setting. Because anything that hasn't been flushed to disk and
|
||||
uploaded to S3 yet needs to be kept in the safekeeper, the "checkpoint
|
||||
distance" also determines the amount of WAL that needs to kept in the
|
||||
safekeeper.
|
||||
|
||||
# L0
|
||||
|
||||
When the Memtable fills up, it is written out to a new file in L0. The
|
||||
files are immutable; when a file is created, it is never
|
||||
modified. Each file in L0 is roughly 1 GB in size (*). Like the
|
||||
Memtable, each file in L0 covers the whole key range.
|
||||
|
||||
When enough files have been accumulated in L0, compaction
|
||||
starts. Compaction processes all the files in L0 and reshuffles the
|
||||
data to create a new set of files in L1.
|
||||
|
||||
|
||||
(*) except in corner cases like if we want to shut down the page
|
||||
server and want to flush out the memtable to disk even though it's not
|
||||
full yet.
|
||||
|
||||
|
||||
# L1
|
||||
|
||||
L1 consists of ~ 1 GB files like L0. But each file covers only part of
|
||||
the overall key space, and a larger range of LSNs. This speeds up
|
||||
searches. When you're looking for a given page, you need to check all
|
||||
the files in L0, to see if they contain a page version for the requested
|
||||
page. But in L1, you only need to check the files whose key range covers
|
||||
the requested page. This is particularly important at cold start, when
|
||||
checking a file means downloading it from S3.
|
||||
|
||||
Partitioning by key range also helps with garbage collection. If only a
|
||||
part of the database is updated, we will accumulate more files for
|
||||
the hot part in L1, and old files can be removed without affecting the
|
||||
cold part.
|
||||
|
||||
|
||||
# Image layers
|
||||
|
||||
So far, we've only talked about delta layers. In addition to the delta
|
||||
layers, we create image layers, when "enough" WAL has been accumulated
|
||||
for some part of the database. Each image layer covers a 1 GB range of
|
||||
key space. It contains images of the pages at a single LSN, a snapshot
|
||||
if you will.
|
||||
|
||||
The exact heuristic for what "enough" means is not clear yet. Maybe
|
||||
create a new image layer when 10 GB of WAL has been accumulated for a
|
||||
1 GB segment.
|
||||
|
||||
The image layers limit the number of layers that a search needs to
|
||||
check. That put a cap on read latency, and it also allows garbage
|
||||
collecting layers that are older than the GC horizon.
|
||||
|
||||
|
||||
# Partitioning scheme
|
||||
|
||||
When compaction happens and creates a new set of files in L1, how do
|
||||
we partition the data into the files?
|
||||
|
||||
- Goal is that each file is ~ 1 GB in size
|
||||
- Try to match partition boundaries at relation boundaries. (See [1]
|
||||
for how PebblesDB does this, and for why that's important)
|
||||
- Greedy algorithm
|
||||
|
||||
# Additional Reading
|
||||
|
||||
[1] Paper on PebblesDB and how it does partitioning.
|
||||
https://www.cs.utexas.edu/~rak/papers/sosp17-pebblesdb.pdf
|
||||
295
docs/rfcs/015-storage-messaging.md
Normal file
295
docs/rfcs/015-storage-messaging.md
Normal file
@@ -0,0 +1,295 @@
|
||||
# Storage messaging
|
||||
|
||||
Created on 19.01.22
|
||||
|
||||
Initially created [here](https://github.com/zenithdb/rfcs/pull/16) by @kelvich.
|
||||
|
||||
That it is an alternative to (014-safekeeper-gossip)[]
|
||||
|
||||
## Motivation
|
||||
|
||||
As in 014-safekeeper-gossip we need to solve the following problems:
|
||||
|
||||
* Trim WAL on safekeepers
|
||||
* Decide on which SK should push WAL to the S3
|
||||
* Decide on which SK should forward WAL to the pageserver
|
||||
* Decide on when to shut down SK<->pageserver connection
|
||||
|
||||
This RFC suggests a more generic and hopefully more manageable way to address those problems. However, unlike 014-safekeeper-gossip, it does not bring us any closer to safekeeper-to-safekeeper recovery but rather unties two sets of different issues we previously wanted to solve with gossip.
|
||||
|
||||
Also, with this approach, we would not need "call me maybe" anymore, and the pageserver will have all the data required to understand that it needs to reconnect to another safekeeper.
|
||||
|
||||
## Summary
|
||||
|
||||
Instead of p2p gossip, let's have a centralized broker where all the storage nodes report per-timeline state. Each storage node should have a `--broker-url=1.2.3.4` CLI param.
|
||||
|
||||
Here I propose two ways to do that. After a lot of arguing with myself, I'm leaning towards the etcd approach. My arguments for it are in the pros/cons section. Both options require adding a Grpc client in our codebase either directly or as an etcd dependency.
|
||||
|
||||
## Non-goals
|
||||
|
||||
That RFC does *not* suggest moving the compute to pageserver and compute to safekeeper mappings out of the console. The console is still the only place in the cluster responsible for the persistency of that info. So I'm implying that each pageserver and safekeeper exactly knows what timelines he serves, as it currently is. We need some mechanism for a new pageserver to discover mapping info, but that is out of the scope of this RFC.
|
||||
|
||||
## Impacted components
|
||||
|
||||
pageserver, safekeeper
|
||||
adds either etcd or console as a storage dependency
|
||||
|
||||
## Possible implementation: custom message broker in the console
|
||||
|
||||
We've decided to go with an etcd approach instead of the message broker.
|
||||
|
||||
<details closed>
|
||||
<summary>Original suggestion</summary>
|
||||
<br>
|
||||
We can add a Grpc service in the console that acts as a message broker since the console knows the addresses of all the components. The broker can ignore the payload and only redirect messages. So, for example, each safekeeper may send a message to the peering safekeepers or to the pageserver responsible for a given timeline.
|
||||
|
||||
Message format could be `{sender, destination, payload}`.
|
||||
|
||||
The destination is either:
|
||||
1. `sk_#{tenant}_#{timeline}` -- to be broadcasted on all safekeepers, responsible for that timeline, or
|
||||
2. `pserver_#{tenant}_#{timeline}` -- to be broadcasted on all pageservers, responsible for that timeline
|
||||
|
||||
Sender is either:
|
||||
1. `sk_#{sk_id}`, or
|
||||
2. `pserver_#{pserver_id}`
|
||||
|
||||
I can think of the following behavior to address our original problems:
|
||||
|
||||
* WAL trimming
|
||||
Each safekeeper periodically broadcasts `(write_lsn, commit_lsn)` to all peering (peering == responsible for that timeline) safekeepers
|
||||
|
||||
* Decide on which SK should push WAL to the S3
|
||||
|
||||
Each safekeeper periodically broadcasts `i_am_alive_#{current_timestamp}` message to all peering safekeepers. That way, safekeepers may maintain the vector of alive peers (loose one, with false negatives). Alive safekeeper with the minimal id pushes data to S3.
|
||||
|
||||
* Decide on which SK should forward WAL to the pageserver
|
||||
|
||||
Each safekeeper periodically sends (write_lsn, commit_lsn, compute_connected) to the relevant pageservers. With that info, pageserver can maintain a view of the safekeepers state, connect to a random one, and detect the moments (e.g., one the safekeepers is not making progress or down) when it needs to reconnect to another safekeeper. Pageserver should resolve exact IP addresses through the console, e.g., exchange `#sk_#{sk_id}` to `4.5.6.7:6400`.
|
||||
|
||||
Pageserver connection to the safekeeper triggered by the state change `compute_connected: false -> true`. With that, we don't need "call me maybe" anymore.
|
||||
|
||||
Also, we don't have a "peer address amnesia" problem as in the gossip approach (with gossip, after a simultaneous reboot, safekeepers wouldn't know each other addresses until the next compute connection).
|
||||
|
||||
* Decide on when to shutdown sk<->pageserver connection
|
||||
|
||||
Again, pageserver would have all the info to understand when to shut down the safekeeper connection.
|
||||
|
||||
### Scalability
|
||||
|
||||
One node is enough (c) No, seriously, it is enough.
|
||||
|
||||
### High Availability
|
||||
|
||||
Broker lives in the console, so we can rely on k8s maintaining the console app alive.
|
||||
|
||||
If the console is down, we won't trim WAL and reconnect the pageserver to another safekeeper. But, at the same, if the console is down, we already can't accept new compute connections and start stopped computes, so we are making things a bit worse, but not dramatically.
|
||||
|
||||
### Interactions
|
||||
|
||||
```
|
||||
.________________.
|
||||
sk_1 <-> | | <-> pserver_1
|
||||
... | Console broker | ...
|
||||
sk_n <-> |________________| <-> pserver_m
|
||||
```
|
||||
</details>
|
||||
|
||||
|
||||
## Implementation: etcd state store
|
||||
|
||||
Alternatively, we can set up `etcd` and maintain the following data structure in it:
|
||||
|
||||
```ruby
|
||||
"compute_#{tenant}_#{timeline}" => {
|
||||
safekeepers => {
|
||||
"sk_#{sk_id}" => {
|
||||
write_lsn: "0/AEDF130",
|
||||
commit_lsn: "0/AEDF100",
|
||||
compute_connected: true,
|
||||
last_updated: 1642621138,
|
||||
},
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
As etcd doesn't support field updates in the nested objects that translates to the following set of keys:
|
||||
|
||||
```ruby
|
||||
"compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/write_lsn",
|
||||
"compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/commit_lsn",
|
||||
...
|
||||
```
|
||||
|
||||
Each storage node can subscribe to the relevant sets of keys and maintain a local view of that structure. So in terms of the data flow, everything is the same as in the previous approach. Still, we can avoid implementing the message broker and prevent runtime storage dependency on a console.
|
||||
|
||||
### Safekeeper address discovery
|
||||
|
||||
During the startup safekeeper should publish the address he is listening on as the part of `{"sk_#{sk_id}" => ip_address}`. Then the pageserver can resolve `sk_#{sk_id}` to the actual address. This way it would work both locally and in the cloud setup. Safekeeper should have `--advertised-address` CLI option so that we can listen on e.g. 0.0.0.0 but advertize something more useful.
|
||||
|
||||
### Safekeeper behavior
|
||||
|
||||
For each timeline safekeeper periodically broadcasts `compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/*` fields. It subscribes to changes of `compute_#{tenant}_#{timeline}` -- that way safekeeper will have an information about peering safekeepers.
|
||||
That amount of information is enough to properly trim WAL. To decide on who is pushing the data to S3 safekeeper may use etcd leases or broadcast a timestamp and hence track who is alive.
|
||||
|
||||
### Pageserver behavior
|
||||
|
||||
Pageserver subscribes to `compute_#{tenant}_#{timeline}` for each tenant it owns. With that info, pageserver can maintain a view of the safekeepers state, connect to a random one, and detect the moments (e.g., one the safekeepers is not making progress or down) when it needs to reconnect to another safekeeper. Pageserver should resolve exact IP addresses through the console, e.g., exchange `#sk_#{sk_id}` to `4.5.6.7:6400`.
|
||||
|
||||
Pageserver connection to the safekeeper can be triggered by the state change `compute_connected: false -> true`. With that, we don't need "call me maybe" anymore.
|
||||
|
||||
As an alternative to compute_connected, we can track timestamp of the latest message arrived to safekeeper from compute. Usually compute broadcasts KeepAlive to all safekeepers every second, so it'll be updated every second when connection is ok. Then the connection can be considered down when this timestamp isn't updated for a several seconds.
|
||||
|
||||
This will help to faster detect issues with safekeeper (and switch to another) in the following cases:
|
||||
|
||||
when compute failed but TCP connection stays alive until timeout (usually about a minute)
|
||||
when safekeeper failed and didn't set compute_connected to false
|
||||
|
||||
Another way to deal with [2] is to process (write_lsn, commit_lsn, compute_connected) as a KeepAlive on the pageserver side and detect issues when sk_id don't send anything for some time. This way is fully compliant to this RFC.
|
||||
|
||||
Also, we don't have a "peer address amnesia" problem as in the gossip approach (with gossip, after a simultaneous reboot, safekeepers wouldn't know each other addresses until the next compute connection).
|
||||
|
||||
### Interactions
|
||||
|
||||
```
|
||||
.________________.
|
||||
sk_1 <-> | | <-> pserver_1
|
||||
... | etcd | ...
|
||||
sk_n <-> |________________| <-> pserver_m
|
||||
```
|
||||
|
||||
### Sequence diagrams for different workflows
|
||||
|
||||
#### Cluster startup
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant C as Compute
|
||||
participant SK1
|
||||
participant SK2
|
||||
participant SK3
|
||||
participant PS1
|
||||
participant PS2
|
||||
participant O as Orchestrator
|
||||
participant M as Metadata Service
|
||||
|
||||
PS1->>M: subscribe to updates to state of timeline N
|
||||
C->>+SK1: WAL push
|
||||
loop constantly update current lsns
|
||||
SK1->>-M: I'm at lsn A
|
||||
end
|
||||
C->>+SK2: WAL push
|
||||
loop constantly update current lsns
|
||||
SK2->>-M: I'm at lsn B
|
||||
end
|
||||
C->>+SK3: WAL push
|
||||
loop constantly update current lsns
|
||||
SK3->>-M: I'm at lsn C
|
||||
end
|
||||
loop request pages
|
||||
C->>+PS1: get_page@lsn
|
||||
PS1->>-C: page image
|
||||
end
|
||||
M->>PS1: New compute appeared for timeline N. SK1 at A, SK2 at B, SK3 at C
|
||||
note over PS1: Say SK1 at A=200, SK2 at B=150 SK3 at C=100 <br> so connect to SK1 because it is the most up to date one
|
||||
PS1->>SK1: start replication
|
||||
```
|
||||
|
||||
#### Behavour of services during typical operations
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant C as Compute
|
||||
participant SK1
|
||||
participant SK2
|
||||
participant SK3
|
||||
participant PS1
|
||||
participant PS2
|
||||
participant O as Orchestrator
|
||||
participant M as Metadata Service
|
||||
|
||||
note over C,M: Scenario 1: Pageserver checkpoint
|
||||
note over PS1: Upload data to S3
|
||||
PS1->>M: Update remote consistent lsn
|
||||
M->>SK1: propagate remote consistent lsn update
|
||||
note over SK1: truncate WAL up to remote consistent lsn
|
||||
M->>SK2: propagate remote consistent lsn update
|
||||
note over SK2: truncate WAL up to remote consistent lsn
|
||||
M->>SK3: propagate remote consistent lsn update
|
||||
note over SK3: truncate WAL up to remote consistent lsn
|
||||
note over C,M: Scenario 2: SK1 finds itself lagging behind MAX(150 (SK2), 200 (SK2)) - 100 (SK1) > THRESHOLD
|
||||
SK1->>SK2: Fetch WAL delta between 100 (SK1) and 200 (SK2)
|
||||
note over C,M: Scenario 3: PS1 detects that SK1 is lagging behind: Connection from SK1 is broken or there is no messages from it in 30 seconds.
|
||||
note over PS1: e.g. SK2 is at 150, SK3 is at 100, chose SK2 as a new replication source
|
||||
PS1->>SK2: start replication
|
||||
```
|
||||
|
||||
#### Behaviour during timeline relocation
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant C as Compute
|
||||
participant SK1
|
||||
participant SK2
|
||||
participant SK3
|
||||
participant PS1
|
||||
participant PS2
|
||||
participant O as Orchestrator
|
||||
participant M as Metadata Service
|
||||
|
||||
note over C,M: Timeline is being relocated from PS1 to PS2
|
||||
O->>+PS2: Attach timeline
|
||||
PS2->>-O: 202 Accepted if timeline exists in S3
|
||||
note over PS2: Download timeline from S3
|
||||
note over O: Poll for timeline download (or subscribe to metadata service)
|
||||
loop wait for attach to complete
|
||||
O->>PS2: timeline detail should answer that timeline is ready
|
||||
end
|
||||
PS2->>M: Register downloaded timeline
|
||||
PS2->>M: Get safekeepers for timeline, subscribe to changes
|
||||
PS2->>SK1: Start replication to catch up
|
||||
note over O: PS2 catched up, time to switch compute
|
||||
O->>C: Restart compute with new pageserver url in config
|
||||
note over C: Wal push is restarted
|
||||
loop request pages
|
||||
C->>+PS2: get_page@lsn
|
||||
PS2->>-C: page image
|
||||
end
|
||||
O->>PS1: detach timeline
|
||||
note over C,M: Scenario 1: Attach call failed
|
||||
O--xPS2: Attach timeline
|
||||
note over O: The operation can be safely retried, <br> if we hit some threshold we can try another pageserver
|
||||
note over C,M: Scenario 2: Attach succeeded but pageserver failed to download the data or start replication
|
||||
loop wait for attach to complete
|
||||
O--xPS2: timeline detail should answer that timeline is ready
|
||||
end
|
||||
note over O: Can wait for a timeout, and then try another pageserver <br> there should be a limit on number of different pageservers to try
|
||||
note over C,M: Scenario 3: Detach fails
|
||||
O--xPS1: Detach timeline
|
||||
note over O: can be retried, if continues to fail might lead to data duplication in s3
|
||||
```
|
||||
|
||||
# Pros/cons
|
||||
|
||||
## Console broker/etcd vs gossip:
|
||||
|
||||
Gossip pros:
|
||||
* gossip allows running storage without the console or etcd
|
||||
|
||||
Console broker/etcd pros:
|
||||
* simpler
|
||||
* solves "call me maybe" as well
|
||||
* avoid possible N-to-N connection issues with gossip without grouping safekeepers in pre-defined triples
|
||||
|
||||
## Console broker vs. etcd:
|
||||
|
||||
Initially, I wanted to avoid etcd as a dependency mostly because I've seen how painful for Clickhouse was their ZooKeeper dependency: in each chat, at each conference, people were complaining about configuration and maintenance barriers with ZooKeeper. It was that bad that ClickHouse re-implemented ZooKeeper to embed it: https://clickhouse.com/docs/en/operations/clickhouse-keeper/.
|
||||
|
||||
But with an etcd we are in a bit different situation:
|
||||
|
||||
1. We don't need persistency and strong consistency guarantees for the data we store in the etcd
|
||||
2. etcd uses Grpc as a protocol, and messages are pretty simple
|
||||
|
||||
So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local zenith installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres).
|
||||
151
docs/rfcs/016-connection-routing.md
Normal file
151
docs/rfcs/016-connection-routing.md
Normal file
@@ -0,0 +1,151 @@
|
||||
# Dispatching a connection
|
||||
|
||||
For each client connection, Neon service needs to authenticate the
|
||||
connection, and route it to the right PostgreSQL instance.
|
||||
|
||||
## Authentication
|
||||
|
||||
There are three different ways to authenticate:
|
||||
|
||||
- anonymous; no authentication needed
|
||||
- PostgreSQL authentication
|
||||
- github single sign-on using browser
|
||||
|
||||
In anonymous access, the user doesn't need to perform any
|
||||
authentication at all. This can be used e.g. in interactive PostgreSQL
|
||||
documentation, allowing you to run the examples very quickly. Similar
|
||||
to sqlfiddle.com.
|
||||
|
||||
PostgreSQL authentication works the same as always. All the different
|
||||
PostgreSQL authentication options like SCRAM, kerberos, etc. are
|
||||
available. [1]
|
||||
|
||||
The third option is to authenticate with github single sign-on. When
|
||||
you open the connection in psql, you get a link that you open with
|
||||
your browser. Opening the link redirects you to github authentication,
|
||||
and lets the connection to proceed. This is also known as "Link auth" [2].
|
||||
|
||||
|
||||
## Routing the connection
|
||||
|
||||
When a client starts a connection, it needs to be routed to the
|
||||
correct PostgreSQL instance. Routing can be done by the proxy, acting
|
||||
as a man-in-the-middle, or the connection can be routed at the network
|
||||
level based on the hostname or IP address.
|
||||
|
||||
Either way, Neon needs to identify which PostgreSQL instance the
|
||||
connection should be routed to. If the instance is not already
|
||||
running, it needs to be started. Some connections always require a new
|
||||
PostgreSQL instance to be created, e.g. if you want to run a one-off
|
||||
query against a particular point-in-time.
|
||||
|
||||
The PostgreSQL instance is identified by:
|
||||
- Neon account (possibly anonymous)
|
||||
- cluster (known as tenant in the storage?)
|
||||
- branch or snapshot name
|
||||
- timestamp (PITR)
|
||||
- primary or read-replica
|
||||
- one-off read replica
|
||||
- one-off writeable branch
|
||||
|
||||
When you are using regular PostgreSQL authentication or anonymous
|
||||
access, the connection URL needs to contain all the information needed
|
||||
for the routing. With github single sign-on, the browser is involved
|
||||
and some details - the Neon account in particular - can be deduced
|
||||
from the authentication exchange.
|
||||
|
||||
There are three methods for identifying the PostgreSQL instance:
|
||||
|
||||
- Browser interaction (link auth)
|
||||
- Options in the connection URL and the domain name
|
||||
- A pre-defined endpoint, identified by domain name or IP address
|
||||
|
||||
### Link Auth
|
||||
|
||||
postgres://<username>@start.neon.tech/<dbname>
|
||||
|
||||
This gives you a link that you open in browser. Clicking the link
|
||||
performs github authentication, and the Neon account name is
|
||||
provided to the proxy behind the scenes. The proxy routes the
|
||||
connection to the primary PostgreSQL instance in cluster called
|
||||
"main", branch "main".
|
||||
|
||||
Further ideas:
|
||||
- You could pre-define a different target for link auth
|
||||
connections in the UI.
|
||||
- You could have a drop-down in the browser, allowing you to connect
|
||||
to any cluster you want. Link Auth can be like Teleport.
|
||||
|
||||
### Connection URL
|
||||
|
||||
The connection URL looks like this:
|
||||
|
||||
postgres://<username>@<cluster-id>.db.neon.tech/<dbname>
|
||||
|
||||
By default, this connects you to the primary PostgreSQL instance
|
||||
running on the "main" branch in the named cluster [3]. However, you can
|
||||
change that by specifying options in the connection URL. The following
|
||||
options are supported:
|
||||
|
||||
| option name | Description | Examples |
|
||||
| --- | --- | --- |
|
||||
| cluster | Cluster name | cluster:myproject |
|
||||
| branch | Branch name | branch:main |
|
||||
| timestamp | Connect to an instance at given point-in-time. | timestamp:2022-04-08 timestamp:2022-04-08T11:42:16Z |
|
||||
| lsn | Connect to an instance at given LSN | lsn:0/12FF0420 |
|
||||
| read-replica | Connect to a read-replica. If the parameter is 'new', a new instance is created for this session. | read-replica read-replica:new |
|
||||
|
||||
For example, to read branch 'testing' as it was on Mar 31, 2022, you could
|
||||
specify a timestamp in the connection URL [4]:
|
||||
|
||||
postgres://alice@cluster-1234.db.neon.tech/postgres?options=branch:testing,timestamp:2022-03-31
|
||||
|
||||
Connecting with cluster name and options can be disabled in the UI. If
|
||||
disabled, you can only connect using a pre-defined endpoint.
|
||||
|
||||
### Pre-defined Endpoint
|
||||
|
||||
Instead of providing the cluster name, branch, and all those options
|
||||
in the connection URL, you can define a named endpoint with the same
|
||||
options.
|
||||
|
||||
In the UI, click "create endpoint". Fill in the details:
|
||||
|
||||
- Cluster name
|
||||
- Branch
|
||||
- timestamp or LSN
|
||||
- is this for the primary or for a read replica
|
||||
- etc.
|
||||
|
||||
When you click Finish, a named endpoint is created. You can now use the endpoint ID to connect:
|
||||
|
||||
postgres://<username>@<endpoint-id>.endpoint.neon.tech/<dbname>
|
||||
|
||||
|
||||
An endpoint can be assigned a static or dynamic IP address, so that
|
||||
you can connect to it with clients that don't support TLS SNI. Maybe
|
||||
bypass the proxy altogether, but that ought to be invisible to the
|
||||
user.
|
||||
|
||||
You can limit the range of source IP addresses that are allowed to
|
||||
connect to an endpoint. An endpoint can also be exposed in an Amazon
|
||||
VPC, allowing direct connections from applications.
|
||||
|
||||
|
||||
# Footnotes
|
||||
|
||||
[1] I'm not sure how feasible it is to set up configure like Kerberos
|
||||
or LDAP in a cloud environment. But in principle I think we should
|
||||
allow customers to have the full power of PostgreSQL, including all
|
||||
authentication options. However, it's up to the customer to configure
|
||||
it correctly.
|
||||
|
||||
[2] Link is a way to both authenticate and to route the connection
|
||||
|
||||
[3] This assumes that cluster-ids are globally unique, across all
|
||||
Neon accounts.
|
||||
|
||||
[4] The syntax accepted in the connection URL is limited by libpq. The
|
||||
only way to pass arbitrary options to the server (or our proxy) is
|
||||
with the "options" keyword, and the options must be percent-encoded. I
|
||||
think the above would work but i haven't tested it
|
||||
79
docs/rfcs/cluster-size-limits.md
Normal file
79
docs/rfcs/cluster-size-limits.md
Normal file
@@ -0,0 +1,79 @@
|
||||
Cluster size limits
|
||||
==================
|
||||
|
||||
## Summary
|
||||
|
||||
One of the resource consumption limits for free-tier users is a cluster size limit.
|
||||
|
||||
To enforce it, we need to calculate the timeline size and check if the limit is reached before relation create/extend operations.
|
||||
If the limit is reached, the query must fail with some meaningful error/warning.
|
||||
We may want to exempt some operations from the quota to allow users free space to fit back into the limit.
|
||||
|
||||
The stateless compute node that performs validation is separate from the storage that calculates the usage, so we need to exchange cluster size information between those components.
|
||||
|
||||
## Motivation
|
||||
|
||||
Limit the maximum size of a PostgreSQL instance to limit free tier users (and other tiers in the future).
|
||||
First of all, this is needed to control our free tier production costs.
|
||||
Another reason to limit resources is risk management — we haven't (fully) tested and optimized zenith for big clusters,
|
||||
so we don't want to give users access to the functionality that we don't think is ready.
|
||||
|
||||
## Components
|
||||
|
||||
* pageserver - calculate the size consumed by a timeline and add it to the feedback message.
|
||||
* safekeeper - pass feedback message from pageserver to compute.
|
||||
* compute - receive feedback message, enforce size limit based on GUC `zenith.max_cluster_size`.
|
||||
* console - set and update `zenith.max_cluster_size` setting
|
||||
|
||||
## Proposed implementation
|
||||
|
||||
First of all, it's necessary to define timeline size.
|
||||
|
||||
The current approach is to count all data, including SLRUs. (not including WAL)
|
||||
Here we think of it as a physical disk underneath the Postgres cluster.
|
||||
This is how the `LOGICAL_TIMELINE_SIZE` metric is implemented in the pageserver.
|
||||
|
||||
Alternatively, we could count only relation data. As in pg_database_size().
|
||||
This approach is somewhat more user-friendly because it is the data that is really affected by the user.
|
||||
On the other hand, it puts us in a weaker position than other services, i.e., RDS.
|
||||
We will need to refactor the timeline_size counter or add another counter to implement it.
|
||||
|
||||
Timeline size is updated during wal digestion. It is not versioned and is valid at the last_received_lsn moment.
|
||||
Then this size should be reported to compute node.
|
||||
|
||||
`current_timeline_size` value is included in the walreceiver's custom feedback message: `ZenithFeedback.`
|
||||
|
||||
(PR about protocol changes https://github.com/zenithdb/zenith/pull/1037).
|
||||
|
||||
This message is received by the safekeeper and propagated to compute node as a part of `AppendResponse`.
|
||||
|
||||
Finally, when compute node receives the `current_timeline_size` from safekeeper (or from pageserver directly), it updates the global variable.
|
||||
|
||||
And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > zenith.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so.
|
||||
(see Postgres error codes [https://www.postgresql.org/docs/devel/errcodes-appendix.html](https://www.postgresql.org/docs/devel/errcodes-appendix.html))
|
||||
|
||||
TODO:
|
||||
We can allow autovacuum processes to bypass this check, simply checking `IsAutoVacuumWorkerProcess()`.
|
||||
It would be nice to allow manual VACUUM and VACUUM FULL to bypass the check, but it's uneasy to distinguish these operations at the low level.
|
||||
See issues https://github.com/neondatabase/neon/issues/1245
|
||||
https://github.com/zenithdb/zenith/issues/1445
|
||||
|
||||
TODO:
|
||||
We should warn users if the limit is soon to be reached.
|
||||
|
||||
### **Reliability, failure modes and corner cases**
|
||||
|
||||
1. `current_timeline_size` is valid at the last received and digested by pageserver lsn.
|
||||
|
||||
If pageserver lags behind compute node, `current_timeline_size` will lag too. This lag can be tuned using backpressure, but it is not expected to be 0 all the time.
|
||||
|
||||
So transactions that happen in this lsn range may cause limit overflow. Especially operations that generate (i.e., CREATE DATABASE) or free (i.e., TRUNCATE) a lot of data pages while generating a small amount of WAL. Are there other operations like this?
|
||||
|
||||
Currently, CREATE DATABASE operations are restricted in the console. So this is not an issue.
|
||||
|
||||
|
||||
### **Security implications**
|
||||
|
||||
We treat compute as an untrusted component. That's why we try to isolate it with secure container runtime or a VM.
|
||||
Malicious users may change the `zenith.max_cluster_size`, so we need an extra size limit check.
|
||||
To cover this case, we also monitor the compute node size in the console.
|
||||
@@ -6,7 +6,6 @@ If there's no such file during `init` phase of the server, it creates the file i
|
||||
There's a possibility to pass an arbitrary config value to the pageserver binary as an argument: such values override
|
||||
the values in the config file, if any are specified for the same key and get into the final config during init phase.
|
||||
|
||||
|
||||
### Config example
|
||||
|
||||
```toml
|
||||
@@ -26,18 +25,22 @@ max_file_descriptors = '100'
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
initial_superuser_name = 'zenith_admin'
|
||||
|
||||
broker_etcd_prefix = 'neon'
|
||||
broker_endpoints = ['some://etcd']
|
||||
|
||||
# [remote_storage]
|
||||
```
|
||||
|
||||
The config above shows default values for all basic pageserver settings.
|
||||
The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user,
|
||||
see the corresponding section below.
|
||||
Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank.
|
||||
Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start.
|
||||
|
||||
Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#table) in TOML specification and
|
||||
|
||||
* either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'`
|
||||
- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'`
|
||||
|
||||
* or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}`
|
||||
- or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}`
|
||||
|
||||
### Config values
|
||||
|
||||
@@ -47,6 +50,17 @@ Example: `${PAGESERVER_BIN} -c "checkpoint_period = '100 s'" -c "remote_storage=
|
||||
|
||||
Note that TOML distinguishes between strings and integers, the former require single or double quotes around them.
|
||||
|
||||
#### broker_endpoints
|
||||
|
||||
A list of endpoints (etcd currently) to connect and pull the information from.
|
||||
Mandatory, does not have a default, since requires etcd to be started as a separate process,
|
||||
and its connection url should be specified separately.
|
||||
|
||||
#### broker_etcd_prefix
|
||||
|
||||
A prefix to add for every etcd key used, to separate one group of related instances from another, in the same cluster.
|
||||
Default is `neon`.
|
||||
|
||||
#### checkpoint_distance
|
||||
|
||||
`checkpoint_distance` is the amount of incoming WAL that is held in
|
||||
@@ -57,7 +71,7 @@ but it will trigger a checkpoint operation to get it back below the
|
||||
limit.
|
||||
|
||||
`checkpoint_distance` also determines how much WAL needs to be kept
|
||||
durable in the safekeeper. The safekeeper must have capacity to hold
|
||||
durable in the safekeeper. The safekeeper must have capacity to hold
|
||||
this much WAL, with some headroom, otherwise you can get stuck in a
|
||||
situation where the safekeeper is full and stops accepting new WAL,
|
||||
but the pageserver is not flushing out and releasing the space in the
|
||||
@@ -68,11 +82,15 @@ S3.
|
||||
|
||||
The unit is # of bytes.
|
||||
|
||||
#### checkpoint_period
|
||||
#### compaction_period
|
||||
|
||||
The pageserver checks whether `checkpoint_distance` has been reached
|
||||
every `checkpoint_period` seconds. Default is 1 s, which should be
|
||||
fine.
|
||||
Every `compaction_period` seconds, the page server checks if
|
||||
maintenance operations, like compaction, are needed on the layer
|
||||
files. Default is 1 s, which should be fine.
|
||||
|
||||
#### compaction_target_size
|
||||
|
||||
File sizes for L0 delta and L1 image layers. Default is 128MB.
|
||||
|
||||
#### gc_horizon
|
||||
|
||||
@@ -85,6 +103,14 @@ away.
|
||||
|
||||
Interval at which garbage collection is triggered. Default is 100 s.
|
||||
|
||||
#### image_creation_threshold
|
||||
|
||||
L0 delta layer threshold for L1 iamge layer creation. Default is 3.
|
||||
|
||||
#### pitr_interval
|
||||
|
||||
WAL retention duration for PITR branching. Default is 30 days.
|
||||
|
||||
#### initial_superuser_name
|
||||
|
||||
Name of the initial superuser role, passed to initdb when a new tenant
|
||||
@@ -151,13 +177,12 @@ bucket_region = 'eu-north-1'
|
||||
# Optional, pageserver uses entire bucket if the prefix is not specified.
|
||||
prefix_in_bucket = '/some/prefix/'
|
||||
|
||||
# Access key to connect to the bucket ("login" part of the credentials)
|
||||
access_key_id = 'SOMEKEYAAAAASADSAH*#'
|
||||
|
||||
# Secret access key to connect to the bucket ("password" part of the credentials)
|
||||
secret_access_key = 'SOMEsEcReTsd292v'
|
||||
# S3 API query limit to avoid getting errors/throttling from AWS.
|
||||
concurrency_limit = 100
|
||||
```
|
||||
|
||||
If no IAM bucket access is used during the remote storage usage, use the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to set the access credentials.
|
||||
|
||||
###### General remote storage configuration
|
||||
|
||||
Pagesever allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used.
|
||||
@@ -167,14 +192,13 @@ Besides, there are parameters common for all types of remote storage that can be
|
||||
|
||||
```toml
|
||||
[remote_storage]
|
||||
# Max number of concurrent connections to open for uploading to or downloading from the remote storage.
|
||||
max_concurrent_sync = 100
|
||||
# Max number of concurrent timeline synchronized (layers uploaded or downloaded) with the remote storage at the same time.
|
||||
max_concurrent_syncs = 50
|
||||
|
||||
# Max number of errors a single task can have before it's considered failed and not attempted to run anymore.
|
||||
max_sync_errors = 10
|
||||
```
|
||||
|
||||
|
||||
## safekeeper
|
||||
|
||||
TODO
|
||||
|
||||
@@ -28,12 +28,7 @@ The pageserver has a few different duties:
|
||||
- Receive WAL from the WAL service and decode it.
|
||||
- Replay WAL that's applicable to the chunks that the Page Server maintains
|
||||
|
||||
For more detailed info, see `/pageserver/README`
|
||||
|
||||
`/postgres_ffi`:
|
||||
|
||||
Utility functions for interacting with PostgreSQL file formats.
|
||||
Misc constants, copied from PostgreSQL headers.
|
||||
For more detailed info, see [/pageserver/README](/pageserver/README.md)
|
||||
|
||||
`/proxy`:
|
||||
|
||||
@@ -57,29 +52,38 @@ PostgreSQL extension that implements storage manager API and network communicati
|
||||
|
||||
PostgreSQL extension that contains functions needed for testing and debugging.
|
||||
|
||||
`/walkeeper`:
|
||||
`/safekeeper`:
|
||||
|
||||
The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
|
||||
It acts as a holding area and redistribution center for recently generated WAL.
|
||||
|
||||
For more detailed info, see `/walkeeper/README`
|
||||
For more detailed info, see [/safekeeper/README](/safekeeper/README.md)
|
||||
|
||||
`/workspace_hack`:
|
||||
The workspace_hack crate exists only to pin down some dependencies.
|
||||
|
||||
We use [cargo-hakari](https://crates.io/crates/cargo-hakari) for automation.
|
||||
|
||||
`/zenith`
|
||||
|
||||
Main entry point for the 'zenith' CLI utility.
|
||||
TODO: Doesn't it belong to control_plane?
|
||||
|
||||
`/zenith_metrics`:
|
||||
`/libs`:
|
||||
Unites granular neon helper crates under the hood.
|
||||
|
||||
`/libs/postgres_ffi`:
|
||||
|
||||
Utility functions for interacting with PostgreSQL file formats.
|
||||
Misc constants, copied from PostgreSQL headers.
|
||||
|
||||
`/libs/utils`:
|
||||
Generic helpers that are shared between other crates in this repository.
|
||||
A subject for future modularization.
|
||||
|
||||
`/libs/metrics`:
|
||||
Helpers for exposing Prometheus metrics from the server.
|
||||
|
||||
`/zenith_utils`:
|
||||
|
||||
Helpers that are shared between other crates in this repository.
|
||||
|
||||
## Using Python
|
||||
Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
|
||||
so manual installation of dependencies is not recommended.
|
||||
@@ -87,18 +91,22 @@ so manual installation of dependencies is not recommended.
|
||||
A single virtual environment with all dependencies is described in the single `Pipfile`.
|
||||
|
||||
### Prerequisites
|
||||
- Install Python 3.7 (the minimal supported version) or greater.
|
||||
- Install Python 3.9 (the minimal supported version) or greater.
|
||||
- Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesnt work as expected.
|
||||
- If you have some trouble with other version you can resolve it by installing Python 3.7 separately, via pyenv or via system package manager e.g.:
|
||||
- If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.:
|
||||
```bash
|
||||
# In Ubuntu
|
||||
sudo add-apt-repository ppa:deadsnakes/ppa
|
||||
sudo apt update
|
||||
sudo apt install python3.7
|
||||
sudo apt install python3.9
|
||||
```
|
||||
- Install `poetry`
|
||||
- Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation)`.
|
||||
- Install dependencies via `./scripts/pysync`. Note that CI uses Python 3.7 so if you have different version some linting tools can yield different result locally vs in the CI.
|
||||
- Install dependencies via `./scripts/pysync`.
|
||||
- Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile))
|
||||
so if you have different version some linting tools can yield different result locally vs in the CI.
|
||||
- You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.9`.
|
||||
This may also disable the `The currently activated Python version X.Y.Z is not supported by the project` warning.
|
||||
|
||||
Run `poetry shell` to activate the virtual environment.
|
||||
Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.
|
||||
|
||||
17
libs/etcd_broker/Cargo.toml
Normal file
17
libs/etcd_broker/Cargo.toml
Normal file
@@ -0,0 +1,17 @@
|
||||
[package]
|
||||
name = "etcd_broker"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
etcd-client = "0.9.0"
|
||||
regex = "1.4.5"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "1.12.0"
|
||||
|
||||
utils = { path = "../utils" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
tokio = "1"
|
||||
tracing = "0.1"
|
||||
thiserror = "1"
|
||||
348
libs/etcd_broker/src/lib.rs
Normal file
348
libs/etcd_broker/src/lib.rs
Normal file
@@ -0,0 +1,348 @@
|
||||
//! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent).
|
||||
//! Intended to connect services to each other, not to store their data.
|
||||
use std::{
|
||||
collections::{hash_map, HashMap},
|
||||
fmt::Display,
|
||||
str::FromStr,
|
||||
};
|
||||
|
||||
use regex::{Captures, Regex};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
pub use etcd_client::*;
|
||||
|
||||
use tokio::{sync::mpsc, task::JoinHandle};
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZNodeId, ZTenantId, ZTenantTimelineId},
|
||||
};
|
||||
|
||||
/// Default value to use for prefixing to all etcd keys with.
|
||||
/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster.
|
||||
pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon";
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
struct SafekeeperTimeline {
|
||||
safekeeper_id: ZNodeId,
|
||||
info: SkTimelineInfo,
|
||||
}
|
||||
|
||||
/// Published data about safekeeper's timeline. Fields made optional for easy migrations.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct SkTimelineInfo {
|
||||
/// Term of the last entry.
|
||||
pub last_log_term: Option<u64>,
|
||||
/// LSN of the last record.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub flush_lsn: Option<Lsn>,
|
||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub commit_lsn: Option<Lsn>,
|
||||
/// LSN up to which safekeeper offloaded WAL to s3.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub s3_wal_lsn: Option<Lsn>,
|
||||
/// LSN of last checkpoint uploaded by pageserver.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub remote_consistent_lsn: Option<Lsn>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub peer_horizon_lsn: Option<Lsn>,
|
||||
#[serde(default)]
|
||||
pub safekeeper_connection_string: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum BrokerError {
|
||||
#[error("Etcd client error: {0}. Context: {1}")]
|
||||
EtcdClient(etcd_client::Error, String),
|
||||
#[error("Error during parsing etcd data: {0}")]
|
||||
ParsingError(String),
|
||||
#[error("Internal error: {0}")]
|
||||
InternalError(String),
|
||||
}
|
||||
|
||||
/// A way to control the data retrieval from a certain subscription.
|
||||
pub struct SkTimelineSubscription {
|
||||
safekeeper_timeline_updates:
|
||||
mpsc::UnboundedReceiver<HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>>>,
|
||||
kind: SkTimelineSubscriptionKind,
|
||||
watcher_handle: JoinHandle<Result<(), BrokerError>>,
|
||||
watcher: Watcher,
|
||||
}
|
||||
|
||||
impl SkTimelineSubscription {
|
||||
/// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet.
|
||||
pub async fn fetch_data(
|
||||
&mut self,
|
||||
) -> Option<HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>>> {
|
||||
self.safekeeper_timeline_updates.recv().await
|
||||
}
|
||||
|
||||
/// Cancels the subscription, stopping the data poller and waiting for it to shut down.
|
||||
pub async fn cancel(mut self) -> Result<(), BrokerError> {
|
||||
self.watcher.cancel().await.map_err(|e| {
|
||||
BrokerError::EtcdClient(
|
||||
e,
|
||||
format!(
|
||||
"Failed to cancel timeline subscription, kind: {:?}",
|
||||
self.kind
|
||||
),
|
||||
)
|
||||
})?;
|
||||
self.watcher_handle.await.map_err(|e| {
|
||||
BrokerError::InternalError(format!(
|
||||
"Failed to join the timeline updates task, kind: {:?}, error: {e}",
|
||||
self.kind
|
||||
))
|
||||
})?
|
||||
}
|
||||
}
|
||||
|
||||
/// The subscription kind to the timeline updates from safekeeper.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct SkTimelineSubscriptionKind {
|
||||
broker_etcd_prefix: String,
|
||||
kind: SubscriptionKind,
|
||||
}
|
||||
|
||||
impl SkTimelineSubscriptionKind {
|
||||
pub fn all(broker_etcd_prefix: String) -> Self {
|
||||
Self {
|
||||
broker_etcd_prefix,
|
||||
kind: SubscriptionKind::All,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn tenant(broker_etcd_prefix: String, tenant: ZTenantId) -> Self {
|
||||
Self {
|
||||
broker_etcd_prefix,
|
||||
kind: SubscriptionKind::Tenant(tenant),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn timeline(broker_etcd_prefix: String, timeline: ZTenantTimelineId) -> Self {
|
||||
Self {
|
||||
broker_etcd_prefix,
|
||||
kind: SubscriptionKind::Timeline(timeline),
|
||||
}
|
||||
}
|
||||
|
||||
fn watch_regex(&self) -> Regex {
|
||||
match self.kind {
|
||||
SubscriptionKind::All => Regex::new(&format!(
|
||||
r"^{}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$",
|
||||
self.broker_etcd_prefix
|
||||
))
|
||||
.expect("wrong regex for 'everything' subscription"),
|
||||
SubscriptionKind::Tenant(tenant_id) => Regex::new(&format!(
|
||||
r"^{}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]])$",
|
||||
self.broker_etcd_prefix
|
||||
))
|
||||
.expect("wrong regex for 'tenant' subscription"),
|
||||
SubscriptionKind::Timeline(ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}) => Regex::new(&format!(
|
||||
r"^{}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]])$",
|
||||
self.broker_etcd_prefix
|
||||
))
|
||||
.expect("wrong regex for 'timeline' subscription"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Etcd key to use for watching a certain timeline updates from safekeepers.
|
||||
pub fn watch_key(&self) -> String {
|
||||
match self.kind {
|
||||
SubscriptionKind::All => self.broker_etcd_prefix.to_string(),
|
||||
SubscriptionKind::Tenant(tenant_id) => {
|
||||
format!("{}/{tenant_id}/safekeeper", self.broker_etcd_prefix)
|
||||
}
|
||||
SubscriptionKind::Timeline(ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}) => format!(
|
||||
"{}/{tenant_id}/{timeline_id}/safekeeper",
|
||||
self.broker_etcd_prefix
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
enum SubscriptionKind {
|
||||
/// Get every timeline update.
|
||||
All,
|
||||
/// Get certain tenant timelines' updates.
|
||||
Tenant(ZTenantId),
|
||||
/// Get certain timeline updates.
|
||||
Timeline(ZTenantTimelineId),
|
||||
}
|
||||
|
||||
/// Creates a background task to poll etcd for timeline updates from safekeepers.
|
||||
/// Stops and returns `Err` on any error during etcd communication.
|
||||
/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle,
|
||||
/// exiting normally in such cases.
|
||||
pub async fn subscribe_to_safekeeper_timeline_updates(
|
||||
client: &mut Client,
|
||||
subscription: SkTimelineSubscriptionKind,
|
||||
) -> Result<SkTimelineSubscription, BrokerError> {
|
||||
info!("Subscribing to timeline updates, subscription kind: {subscription:?}");
|
||||
|
||||
let (watcher, mut stream) = client
|
||||
.watch(
|
||||
subscription.watch_key(),
|
||||
Some(WatchOptions::new().with_prefix()),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
BrokerError::EtcdClient(
|
||||
e,
|
||||
format!("Failed to init the watch for subscription {subscription:?}"),
|
||||
)
|
||||
})?;
|
||||
|
||||
let (timeline_updates_sender, safekeeper_timeline_updates) = mpsc::unbounded_channel();
|
||||
|
||||
let subscription_kind = subscription.kind;
|
||||
let regex = subscription.watch_regex();
|
||||
let watcher_handle = tokio::spawn(async move {
|
||||
while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!(
|
||||
"Failed to get messages from the subscription stream, kind: {subscription_kind:?}, error: {e}"
|
||||
)))? {
|
||||
if resp.canceled() {
|
||||
info!("Watch for timeline updates subscription was canceled, exiting");
|
||||
break;
|
||||
}
|
||||
|
||||
let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>> = HashMap::new();
|
||||
// Keep track that the timeline data updates from etcd arrive in the right order.
|
||||
// https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas
|
||||
// > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering.
|
||||
let mut timeline_etcd_versions: HashMap<ZTenantTimelineId, i64> = HashMap::new();
|
||||
|
||||
|
||||
let events = resp.events();
|
||||
debug!("Processing {} events", events.len());
|
||||
|
||||
for event in events {
|
||||
if EventType::Put == event.event_type() {
|
||||
if let Some(new_etcd_kv) = event.kv() {
|
||||
let new_kv_version = new_etcd_kv.version();
|
||||
|
||||
match parse_etcd_key_value(subscription_kind, ®ex, new_etcd_kv) {
|
||||
Ok(Some((zttid, timeline))) => {
|
||||
match timeline_updates
|
||||
.entry(zttid)
|
||||
.or_default()
|
||||
.entry(timeline.safekeeper_id)
|
||||
{
|
||||
hash_map::Entry::Occupied(mut o) => {
|
||||
let old_etcd_kv_version = timeline_etcd_versions.get(&zttid).copied().unwrap_or(i64::MIN);
|
||||
if old_etcd_kv_version < new_kv_version {
|
||||
o.insert(timeline.info);
|
||||
timeline_etcd_versions.insert(zttid,new_kv_version);
|
||||
}
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(timeline.info);
|
||||
timeline_etcd_versions.insert(zttid,new_kv_version);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None) => {}
|
||||
Err(e) => error!("Failed to parse timeline update: {e}"),
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Err(e) = timeline_updates_sender.send(timeline_updates) {
|
||||
info!("Timeline updates sender got dropped, exiting: {e}");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
});
|
||||
|
||||
Ok(SkTimelineSubscription {
|
||||
kind: subscription,
|
||||
safekeeper_timeline_updates,
|
||||
watcher_handle,
|
||||
watcher,
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_etcd_key_value(
|
||||
subscription_kind: SubscriptionKind,
|
||||
regex: &Regex,
|
||||
kv: &KeyValue,
|
||||
) -> Result<Option<(ZTenantTimelineId, SafekeeperTimeline)>, BrokerError> {
|
||||
let caps = if let Some(caps) = regex.captures(kv.key_str().map_err(|e| {
|
||||
BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as key str"))
|
||||
})?) {
|
||||
caps
|
||||
} else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let (zttid, safekeeper_id) = match subscription_kind {
|
||||
SubscriptionKind::All => (
|
||||
ZTenantTimelineId::new(
|
||||
parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
|
||||
parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?,
|
||||
),
|
||||
ZNodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?),
|
||||
),
|
||||
SubscriptionKind::Tenant(tenant_id) => (
|
||||
ZTenantTimelineId::new(
|
||||
tenant_id,
|
||||
parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
|
||||
),
|
||||
ZNodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?),
|
||||
),
|
||||
SubscriptionKind::Timeline(zttid) => (
|
||||
zttid,
|
||||
ZNodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?),
|
||||
),
|
||||
};
|
||||
|
||||
let info_str = kv.value_str().map_err(|e| {
|
||||
BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as value str"))
|
||||
})?;
|
||||
Ok(Some((
|
||||
zttid,
|
||||
SafekeeperTimeline {
|
||||
safekeeper_id,
|
||||
info: serde_json::from_str(info_str).map_err(|e| {
|
||||
BrokerError::ParsingError(format!(
|
||||
"Failed to parse '{info_str}' as safekeeper timeline info: {e}"
|
||||
))
|
||||
})?,
|
||||
},
|
||||
)))
|
||||
}
|
||||
|
||||
fn parse_capture<T>(caps: &Captures, index: usize) -> Result<T, String>
|
||||
where
|
||||
T: FromStr,
|
||||
<T as FromStr>::Err: Display,
|
||||
{
|
||||
let capture_match = caps
|
||||
.get(index)
|
||||
.ok_or_else(|| format!("Failed to get capture match at index {index}"))?
|
||||
.as_str();
|
||||
capture_match.parse().map_err(|e| {
|
||||
format!(
|
||||
"Failed to parse {} from {capture_match}: {e}",
|
||||
std::any::type_name::<T>()
|
||||
)
|
||||
})
|
||||
}
|
||||
11
libs/metrics/Cargo.toml
Normal file
11
libs/metrics/Cargo.toml
Normal file
@@ -0,0 +1,11 @@
|
||||
[package]
|
||||
name = "metrics"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
libc = "0.2"
|
||||
lazy_static = "1.4"
|
||||
once_cell = "1.8.0"
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
@@ -3,7 +3,6 @@
|
||||
//! Otherwise, we might not see all metrics registered via
|
||||
//! a default registry.
|
||||
use lazy_static::lazy_static;
|
||||
use once_cell::race::OnceBox;
|
||||
pub use prometheus::{exponential_buckets, linear_buckets};
|
||||
pub use prometheus::{register_gauge, Gauge};
|
||||
pub use prometheus::{register_gauge_vec, GaugeVec};
|
||||
@@ -27,48 +26,15 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
|
||||
prometheus::gather()
|
||||
}
|
||||
|
||||
static COMMON_METRICS_PREFIX: OnceBox<&str> = OnceBox::new();
|
||||
|
||||
/// Sets a prefix which will be used for all common metrics, typically a service
|
||||
/// name like 'pageserver'. Should be executed exactly once in the beginning of
|
||||
/// any executable which uses common metrics.
|
||||
pub fn set_common_metrics_prefix(prefix: &'static str) {
|
||||
// Not unwrap() because metrics may be initialized after multiple threads have been started.
|
||||
COMMON_METRICS_PREFIX
|
||||
.set(prefix.into())
|
||||
.unwrap_or_else(|_| {
|
||||
eprintln!(
|
||||
"set_common_metrics_prefix() was called second time with '{}', exiting",
|
||||
prefix
|
||||
);
|
||||
std::process::exit(1);
|
||||
});
|
||||
}
|
||||
|
||||
/// Prepends a prefix to a common metric name so they are distinguished between
|
||||
/// different services, see <https://github.com/zenithdb/zenith/pull/681>
|
||||
/// A call to set_common_metrics_prefix() is necessary prior to calling this.
|
||||
pub fn new_common_metric_name(unprefixed_metric_name: &str) -> String {
|
||||
// Not unwrap() because metrics may be initialized after multiple threads have been started.
|
||||
format!(
|
||||
"{}_{}",
|
||||
COMMON_METRICS_PREFIX.get().unwrap_or_else(|| {
|
||||
eprintln!("set_common_metrics_prefix() was not called, but metrics are used, exiting");
|
||||
std::process::exit(1);
|
||||
}),
|
||||
unprefixed_metric_name
|
||||
)
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!(
|
||||
new_common_metric_name("disk_io_bytes"),
|
||||
"libmetrics_disk_io_bytes_total",
|
||||
"Bytes written and read from disk, grouped by the operation (read|write)",
|
||||
&["io_operation"]
|
||||
)
|
||||
.expect("Failed to register disk i/o bytes int gauge vec");
|
||||
static ref MAXRSS_KB: IntGauge = register_int_gauge!(
|
||||
new_common_metric_name("maxrss_kb"),
|
||||
"libmetrics_maxrss_kb",
|
||||
"Memory usage (Maximum Resident Set Size)"
|
||||
)
|
||||
.expect("Failed to register maxrss_kb int gauge");
|
||||
@@ -8,8 +8,8 @@ use std::io::{Read, Result, Write};
|
||||
///
|
||||
/// ```
|
||||
/// # use std::io::{Result, Read};
|
||||
/// # use zenith_metrics::{register_int_counter, IntCounter};
|
||||
/// # use zenith_metrics::CountedReader;
|
||||
/// # use metrics::{register_int_counter, IntCounter};
|
||||
/// # use metrics::CountedReader;
|
||||
/// #
|
||||
/// # lazy_static::lazy_static! {
|
||||
/// # static ref INT_COUNTER: IntCounter = register_int_counter!(
|
||||
@@ -83,8 +83,8 @@ impl<T: Read> Read for CountedReader<'_, T> {
|
||||
///
|
||||
/// ```
|
||||
/// # use std::io::{Result, Write};
|
||||
/// # use zenith_metrics::{register_int_counter, IntCounter};
|
||||
/// # use zenith_metrics::CountedWriter;
|
||||
/// # use metrics::{register_int_counter, IntCounter};
|
||||
/// # use metrics::CountedWriter;
|
||||
/// #
|
||||
/// # lazy_static::lazy_static! {
|
||||
/// # static ref INT_COUNTER: IntCounter = register_int_counter!(
|
||||
@@ -17,8 +17,8 @@ log = "0.4.14"
|
||||
memoffset = "0.6.2"
|
||||
thiserror = "1.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
utils = { path = "../utils" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[build-dependencies]
|
||||
bindgen = "0.59.1"
|
||||
@@ -88,8 +88,8 @@ fn main() {
|
||||
// 'pg_config --includedir-server' would perhaps be the more proper way to find it,
|
||||
// but this will do for now.
|
||||
//
|
||||
.clang_arg("-I../tmp_install/include/server")
|
||||
.clang_arg("-I../tmp_install/include/postgresql/server")
|
||||
.clang_arg("-I../../tmp_install/include/server")
|
||||
.clang_arg("-I../../tmp_install/include/postgresql/server")
|
||||
//
|
||||
// Finish the builder and generate the bindings.
|
||||
//
|
||||
@@ -43,7 +43,7 @@ impl ControlFileData {
|
||||
/// Interpret a slice of bytes as a Postgres control file.
|
||||
///
|
||||
pub fn decode(buf: &[u8]) -> Result<ControlFileData> {
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
use utils::bin_ser::LeSer;
|
||||
|
||||
// Check that the slice has the expected size. The control file is
|
||||
// padded with zeros up to a 512 byte sector size, so accept a
|
||||
@@ -77,7 +77,7 @@ impl ControlFileData {
|
||||
///
|
||||
/// The CRC is recomputed to match the contents of the fields.
|
||||
pub fn encode(&self) -> Bytes {
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
use utils::bin_ser::LeSer;
|
||||
|
||||
// Serialize into a new buffer.
|
||||
let b = self.ser().unwrap();
|
||||
@@ -8,6 +8,7 @@
|
||||
#![allow(deref_nullptr)]
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
|
||||
|
||||
@@ -37,3 +38,21 @@ pub const fn transaction_id_precedes(id1: TransactionId, id2: TransactionId) ->
|
||||
let diff = id1.wrapping_sub(id2) as i32;
|
||||
diff < 0
|
||||
}
|
||||
|
||||
// Check if page is not yet initialized (port of Postgres PageIsInit() macro)
|
||||
pub fn page_is_new(pg: &[u8]) -> bool {
|
||||
pg[14] == 0 && pg[15] == 0 // pg_upper == 0
|
||||
}
|
||||
|
||||
// ExtractLSN from page header
|
||||
pub fn page_get_lsn(pg: &[u8]) -> Lsn {
|
||||
Lsn(
|
||||
((u32::from_le_bytes(pg[0..4].try_into().unwrap()) as u64) << 32)
|
||||
| u32::from_le_bytes(pg[4..8].try_into().unwrap()) as u64,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) {
|
||||
pg[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
|
||||
pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
|
||||
}
|
||||
@@ -24,6 +24,9 @@ pub const VISIBILITYMAP_FORKNUM: u8 = 2;
|
||||
pub const INIT_FORKNUM: u8 = 3;
|
||||
|
||||
// From storage_xlog.h
|
||||
pub const XLOG_SMGR_CREATE: u8 = 0x10;
|
||||
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
|
||||
|
||||
pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
|
||||
pub const SMGR_TRUNCATE_VM: u32 = 0x0002;
|
||||
pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;
|
||||
@@ -113,7 +116,6 @@ pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
|
||||
// From pg_control.h and rmgrlist.h
|
||||
pub const XLOG_NEXTOID: u8 = 0x30;
|
||||
pub const XLOG_SWITCH: u8 = 0x40;
|
||||
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
|
||||
pub const XLOG_FPI_FOR_HINT: u8 = 0xA0;
|
||||
pub const XLOG_FPI: u8 = 0xB0;
|
||||
pub const DB_SHUTDOWNED: u32 = 1;
|
||||
@@ -4,7 +4,7 @@
|
||||
//! This understands the WAL page and record format, enough to figure out where the WAL record
|
||||
//! boundaries are, and to reassemble WAL records that cross page boundaries.
|
||||
//!
|
||||
//! This functionality is needed by both the pageserver and the walkeepers. The pageserver needs
|
||||
//! This functionality is needed by both the pageserver and the safekeepers. The pageserver needs
|
||||
//! to look deeper into the WAL records to also understand which blocks they modify, the code
|
||||
//! for that is in pageserver/src/walrecord.rs
|
||||
//!
|
||||
@@ -18,7 +18,7 @@ use crc32c::*;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
use thiserror::Error;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub struct WalStreamDecoder {
|
||||
lsn: Lsn,
|
||||
@@ -89,7 +89,12 @@ impl WalStreamDecoder {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf);
|
||||
let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
|
||||
WalDecodeError {
|
||||
msg: format!("long header deserialization failed {}", e),
|
||||
lsn: self.lsn,
|
||||
}
|
||||
})?;
|
||||
|
||||
if hdr.std.xlp_pageaddr != self.lsn.0 {
|
||||
return Err(WalDecodeError {
|
||||
@@ -106,7 +111,12 @@ impl WalStreamDecoder {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf);
|
||||
let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
|
||||
WalDecodeError {
|
||||
msg: format!("header deserialization failed {}", e),
|
||||
lsn: self.lsn,
|
||||
}
|
||||
})?;
|
||||
|
||||
if hdr.xlp_pageaddr != self.lsn.0 {
|
||||
return Err(WalDecodeError {
|
||||
@@ -188,7 +198,13 @@ impl WalStreamDecoder {
|
||||
}
|
||||
|
||||
// We now have a record in the 'recordbuf' local variable.
|
||||
let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]);
|
||||
let xlogrec =
|
||||
XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| {
|
||||
WalDecodeError {
|
||||
msg: format!("xlog record deserialization failed {}", e),
|
||||
lsn: self.lsn,
|
||||
}
|
||||
})?;
|
||||
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
|
||||
@@ -15,7 +15,7 @@ use crate::XLogPageHeaderData;
|
||||
use crate::XLogRecord;
|
||||
use crate::XLOG_PAGE_MAGIC;
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use anyhow::bail;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::BytesMut;
|
||||
use bytes::{Buf, Bytes};
|
||||
@@ -28,7 +28,9 @@ use std::io::prelude::*;
|
||||
use std::io::SeekFrom;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::SystemTime;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub const XLOG_FNAME_LEN: usize = 24;
|
||||
pub const XLOG_BLCKSZ: usize = 8192;
|
||||
@@ -118,11 +120,15 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn {
|
||||
}
|
||||
|
||||
pub fn get_current_timestamp() -> TimestampTz {
|
||||
to_pg_timestamp(SystemTime::now())
|
||||
}
|
||||
|
||||
pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
|
||||
const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */
|
||||
const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */
|
||||
const SECS_PER_DAY: u64 = 86400;
|
||||
const USECS_PER_SEC: u64 = 1000000;
|
||||
match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) {
|
||||
match time.duration_since(SystemTime::UNIX_EPOCH) {
|
||||
Ok(n) => {
|
||||
((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY))
|
||||
* USECS_PER_SEC
|
||||
@@ -140,7 +146,7 @@ fn find_end_of_wal_segment(
|
||||
tli: TimeLineID,
|
||||
wal_seg_size: usize,
|
||||
start_offset: usize, // start reading at this point
|
||||
) -> Result<u32> {
|
||||
) -> anyhow::Result<u32> {
|
||||
// step back to the beginning of the page to read it in...
|
||||
let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ;
|
||||
let mut contlen: usize = 0;
|
||||
@@ -268,7 +274,7 @@ pub fn find_end_of_wal(
|
||||
wal_seg_size: usize,
|
||||
precise: bool,
|
||||
start_lsn: Lsn, // start reading WAL at this point or later
|
||||
) -> Result<(XLogRecPtr, TimeLineID)> {
|
||||
) -> anyhow::Result<(XLogRecPtr, TimeLineID)> {
|
||||
let mut high_segno: XLogSegNo = 0;
|
||||
let mut high_tli: TimeLineID = 0;
|
||||
let mut high_ispartial = false;
|
||||
@@ -350,19 +356,19 @@ pub fn main() {
|
||||
}
|
||||
|
||||
impl XLogRecord {
|
||||
pub fn from_slice(buf: &[u8]) -> XLogRecord {
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
XLogRecord::des(buf).unwrap()
|
||||
pub fn from_slice(buf: &[u8]) -> Result<XLogRecord, DeserializeError> {
|
||||
use utils::bin_ser::LeSer;
|
||||
XLogRecord::des(buf)
|
||||
}
|
||||
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogRecord {
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
XLogRecord::des_from(&mut buf.reader()).unwrap()
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> Result<XLogRecord, DeserializeError> {
|
||||
use utils::bin_ser::LeSer;
|
||||
XLogRecord::des_from(&mut buf.reader())
|
||||
}
|
||||
|
||||
pub fn encode(&self) -> Bytes {
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
self.ser().unwrap().into()
|
||||
pub fn encode(&self) -> Result<Bytes, SerializeError> {
|
||||
use utils::bin_ser::LeSer;
|
||||
Ok(self.ser()?.into())
|
||||
}
|
||||
|
||||
// Is this record an XLOG_SWITCH record? They need some special processing,
|
||||
@@ -372,35 +378,35 @@ impl XLogRecord {
|
||||
}
|
||||
|
||||
impl XLogPageHeaderData {
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogPageHeaderData {
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
XLogPageHeaderData::des_from(&mut buf.reader()).unwrap()
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> Result<XLogPageHeaderData, DeserializeError> {
|
||||
use utils::bin_ser::LeSer;
|
||||
XLogPageHeaderData::des_from(&mut buf.reader())
|
||||
}
|
||||
}
|
||||
|
||||
impl XLogLongPageHeaderData {
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogLongPageHeaderData {
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
XLogLongPageHeaderData::des_from(&mut buf.reader()).unwrap()
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> Result<XLogLongPageHeaderData, DeserializeError> {
|
||||
use utils::bin_ser::LeSer;
|
||||
XLogLongPageHeaderData::des_from(&mut buf.reader())
|
||||
}
|
||||
|
||||
pub fn encode(&self) -> Bytes {
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
self.ser().unwrap().into()
|
||||
pub fn encode(&self) -> Result<Bytes, SerializeError> {
|
||||
use utils::bin_ser::LeSer;
|
||||
self.ser().map(|b| b.into())
|
||||
}
|
||||
}
|
||||
|
||||
pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>();
|
||||
|
||||
impl CheckPoint {
|
||||
pub fn encode(&self) -> Bytes {
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
self.ser().unwrap().into()
|
||||
pub fn encode(&self) -> Result<Bytes, SerializeError> {
|
||||
use utils::bin_ser::LeSer;
|
||||
Ok(self.ser()?.into())
|
||||
}
|
||||
|
||||
pub fn decode(buf: &[u8]) -> Result<CheckPoint, anyhow::Error> {
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
Ok(CheckPoint::des(buf)?)
|
||||
pub fn decode(buf: &[u8]) -> Result<CheckPoint, DeserializeError> {
|
||||
use utils::bin_ser::LeSer;
|
||||
CheckPoint::des(buf)
|
||||
}
|
||||
|
||||
/// Update next XID based on provided new_xid and stored epoch.
|
||||
@@ -438,7 +444,7 @@ impl CheckPoint {
|
||||
// Generate new, empty WAL segment.
|
||||
// We need this segment to start compute node.
|
||||
//
|
||||
pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
|
||||
pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, SerializeError> {
|
||||
let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);
|
||||
|
||||
let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE);
|
||||
@@ -458,12 +464,12 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
|
||||
xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
|
||||
};
|
||||
|
||||
let hdr_bytes = hdr.encode();
|
||||
let hdr_bytes = hdr.encode()?;
|
||||
seg_buf.extend_from_slice(&hdr_bytes);
|
||||
|
||||
//zero out the rest of the file
|
||||
seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0);
|
||||
seg_buf.freeze()
|
||||
Ok(seg_buf.freeze())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -477,7 +483,9 @@ mod tests {
|
||||
#[test]
|
||||
pub fn test_find_end_of_wal() {
|
||||
// 1. Run initdb to generate some WAL
|
||||
let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("..");
|
||||
let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("..")
|
||||
.join("..");
|
||||
let data_dir = top_path.join("test_output/test_find_end_of_wal");
|
||||
let initdb_path = top_path.join("tmp_install/bin/initdb");
|
||||
let lib_path = top_path.join("tmp_install/lib");
|
||||
@@ -495,7 +503,13 @@ mod tests {
|
||||
.env("DYLD_LIBRARY_PATH", &lib_path)
|
||||
.output()
|
||||
.unwrap();
|
||||
assert!(initdb_output.status.success());
|
||||
assert!(
|
||||
initdb_output.status.success(),
|
||||
"initdb failed. Status: '{}', stdout: '{}', stderr: '{}'",
|
||||
initdb_output.status,
|
||||
String::from_utf8_lossy(&initdb_output.stdout),
|
||||
String::from_utf8_lossy(&initdb_output.stderr),
|
||||
);
|
||||
|
||||
// 2. Pick WAL generated by initdb
|
||||
let wal_dir = data_dir.join("pg_wal");
|
||||
20
libs/remote_storage/Cargo.toml
Normal file
20
libs/remote_storage/Cargo.toml
Normal file
@@ -0,0 +1,20 @@
|
||||
[package]
|
||||
name = "remote_storage"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
tracing = "0.1.27"
|
||||
rusoto_core = "0.48"
|
||||
rusoto_s3 = "0.48"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
async-trait = "0.1"
|
||||
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.2"
|
||||
233
libs/remote_storage/src/lib.rs
Normal file
233
libs/remote_storage/src/lib.rs
Normal file
@@ -0,0 +1,233 @@
|
||||
//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
|
||||
//! No other modules from this tree are supposed to be used directly by the external code.
|
||||
//!
|
||||
//! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
|
||||
//! * [`local_fs`] allows to use local file system as an external storage
|
||||
//! * [`s3_bucket`] uses AWS S3 bucket as an external storage
|
||||
//!
|
||||
mod local_fs;
|
||||
mod s3_bucket;
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
ffi::OsStr,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use tokio::io;
|
||||
use tracing::info;
|
||||
|
||||
pub use self::{
|
||||
local_fs::LocalFs,
|
||||
s3_bucket::{S3Bucket, S3ObjectKey},
|
||||
};
|
||||
|
||||
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
|
||||
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
|
||||
/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
|
||||
/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
/// ~200 RPS for IAM services
|
||||
/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
|
||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RemoteStorage: Send + Sync {
|
||||
/// A way to uniquely reference a file in the remote storage.
|
||||
type RemoteObjectId;
|
||||
|
||||
/// Attempts to derive the storage path out of the local path, if the latter is correct.
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId>;
|
||||
|
||||
/// Gets the download path of the given storage file.
|
||||
fn local_path(&self, remote_object_id: &Self::RemoteObjectId) -> anyhow::Result<PathBuf>;
|
||||
|
||||
/// Lists all items the storage has right now.
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
from_size_bytes: usize,
|
||||
to: &Self::RemoteObjectId,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>>;
|
||||
|
||||
/// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>>;
|
||||
|
||||
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
/// Every storage, currently supported.
|
||||
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
|
||||
pub enum GenericRemoteStorage {
|
||||
Local(LocalFs),
|
||||
S3(S3Bucket),
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
pub fn new(
|
||||
working_directory: PathBuf,
|
||||
storage_config: &RemoteStorageConfig,
|
||||
) -> anyhow::Result<Self> {
|
||||
match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => {
|
||||
info!("Using fs root '{}' as a remote storage", root.display());
|
||||
LocalFs::new(root.clone(), working_directory).map(GenericRemoteStorage::Local)
|
||||
}
|
||||
RemoteStorageKind::AwsS3(s3_config) => {
|
||||
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
|
||||
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
|
||||
S3Bucket::new(s3_config, working_directory).map(GenericRemoteStorage::S3)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
|
||||
/// Immutable, cannot be changed once the file is created.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct StorageMetadata(HashMap<String, String>);
|
||||
|
||||
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
|
||||
if prefix == path {
|
||||
anyhow::bail!(
|
||||
"Prefix and the path are equal, cannot strip: '{}'",
|
||||
prefix.display()
|
||||
)
|
||||
} else {
|
||||
path.strip_prefix(prefix).with_context(|| {
|
||||
format!(
|
||||
"Path '{}' is not prefixed with '{}'",
|
||||
path.display(),
|
||||
prefix.display(),
|
||||
)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// Max allowed number of concurrent sync operations between the API user and the remote storage.
|
||||
pub max_concurrent_syncs: NonZeroUsize,
|
||||
/// Max allowed errors before the sync task is considered failed and evicted.
|
||||
pub max_sync_errors: NonZeroU32,
|
||||
/// The storage connection configuration.
|
||||
pub storage: RemoteStorageKind,
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum RemoteStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored files into.
|
||||
LocalFs(PathBuf),
|
||||
/// AWS S3 based storage, storing all files in the S3 bucket
|
||||
/// specified by the config
|
||||
AwsS3(S3Config),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
pub struct S3Config {
|
||||
/// Name of the bucket to connect to.
|
||||
pub bucket_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub bucket_region: String,
|
||||
/// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once.
|
||||
pub prefix_in_bucket: Option<String>,
|
||||
/// A base URL to send S3 requests to.
|
||||
/// By default, the endpoint is derived from a region name, assuming it's
|
||||
/// an AWS S3 region name, erroring on wrong region name.
|
||||
/// Endpoint provides a way to support other S3 flavors and their regions.
|
||||
///
|
||||
/// Example: `http://127.0.0.1:5000`
|
||||
pub endpoint: Option<String>,
|
||||
/// AWS S3 has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for S3Config {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("S3Config")
|
||||
.field("bucket_name", &self.bucket_name)
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_bucket)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str) -> PathBuf {
|
||||
let new_extension = match original_path
|
||||
.as_ref()
|
||||
.extension()
|
||||
.map(OsStr::to_string_lossy)
|
||||
{
|
||||
Some(extension) => Cow::Owned(format!("{extension}.{suffix}")),
|
||||
None => Cow::Borrowed(suffix),
|
||||
};
|
||||
original_path
|
||||
.as_ref()
|
||||
.with_extension(new_extension.as_ref())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_path_with_suffix_extension() {
|
||||
let p = PathBuf::from("/foo/bar");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp").to_string_lossy(),
|
||||
"/foo/bar.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
|
||||
"/foo/bar.temp.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar.baz");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
|
||||
"/foo/bar.baz.temp.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar.baz");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, ".temp").to_string_lossy(),
|
||||
"/foo/bar.baz..temp"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1,11 +1,10 @@
|
||||
//! Local filesystem acting as a remote storage.
|
||||
//! Multiple pageservers can use the same "storage" of this kind by using different storage roots.
|
||||
//! Multiple API users can use the same "storage" of this kind by using different storage roots.
|
||||
//!
|
||||
//! This storage used in pageserver tests, but can also be used in cases when a certain persistent
|
||||
//! This storage used in tests, but can also be used in cases when a certain persistent
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
ffi::OsString,
|
||||
future::Future,
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
@@ -18,16 +17,18 @@ use tokio::{
|
||||
};
|
||||
use tracing::*;
|
||||
|
||||
use super::{strip_path_prefix, RemoteStorage};
|
||||
use crate::path_with_suffix_extension;
|
||||
|
||||
use super::{strip_path_prefix, RemoteStorage, StorageMetadata};
|
||||
|
||||
pub struct LocalFs {
|
||||
pageserver_workdir: &'static Path,
|
||||
root: PathBuf,
|
||||
working_directory: PathBuf,
|
||||
storage_root: PathBuf,
|
||||
}
|
||||
|
||||
impl LocalFs {
|
||||
/// Attempts to create local FS storage, along with its root directory.
|
||||
pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
|
||||
pub fn new(root: PathBuf, working_directory: PathBuf) -> anyhow::Result<Self> {
|
||||
if !root.exists() {
|
||||
std::fs::create_dir_all(&root).with_context(|| {
|
||||
format!(
|
||||
@@ -37,15 +38,15 @@ impl LocalFs {
|
||||
})?;
|
||||
}
|
||||
Ok(Self {
|
||||
pageserver_workdir,
|
||||
root,
|
||||
working_directory,
|
||||
storage_root: root,
|
||||
})
|
||||
}
|
||||
|
||||
fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
|
||||
if path.is_relative() {
|
||||
Ok(self.root.join(path))
|
||||
} else if path.starts_with(&self.root) {
|
||||
Ok(self.storage_root.join(path))
|
||||
} else if path.starts_with(&self.storage_root) {
|
||||
Ok(path.to_path_buf())
|
||||
} else {
|
||||
bail!(
|
||||
@@ -54,46 +55,68 @@ impl LocalFs {
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn read_storage_metadata(
|
||||
&self,
|
||||
file_path: &Path,
|
||||
) -> anyhow::Result<Option<StorageMetadata>> {
|
||||
let metadata_path = storage_metadata_path(file_path);
|
||||
if metadata_path.exists() && metadata_path.is_file() {
|
||||
let metadata_string = fs::read_to_string(&metadata_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to read metadata from the local storage at '{}'",
|
||||
metadata_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
serde_json::from_str(&metadata_string)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize metadata from the local storage at '{}'",
|
||||
metadata_path.display()
|
||||
)
|
||||
})
|
||||
.map(|metadata| Some(StorageMetadata(metadata)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
type StoragePath = PathBuf;
|
||||
type RemoteObjectId = PathBuf;
|
||||
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
|
||||
Ok(self.root.join(
|
||||
strip_path_prefix(self.pageserver_workdir, local_path)
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId> {
|
||||
Ok(self.storage_root.join(
|
||||
strip_path_prefix(&self.working_directory, local_path)
|
||||
.context("local path does not belong to this storage")?,
|
||||
))
|
||||
}
|
||||
|
||||
fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
|
||||
let relative_path = strip_path_prefix(&self.root, storage_path)
|
||||
fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result<PathBuf> {
|
||||
let relative_path = strip_path_prefix(&self.storage_root, storage_path)
|
||||
.context("local path does not belong to this storage")?;
|
||||
Ok(self.pageserver_workdir.join(relative_path))
|
||||
Ok(self.working_directory.join(relative_path))
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
|
||||
get_all_files(&self.root).await
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
|
||||
get_all_files(&self.storage_root).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
to: &Self::StoragePath,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
to: &Self::RemoteObjectId,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
let target_file_path = self.resolve_in_storage(to)?;
|
||||
create_target_directory(&target_file_path).await?;
|
||||
// We need this dance with sort of durable rename (without fsyncs)
|
||||
// to prevent partial uploads. This was really hit when pageserver shutdown
|
||||
// cancelled the upload and partial file was left on the fs
|
||||
let mut temp_extension = target_file_path
|
||||
.extension()
|
||||
.unwrap_or_default()
|
||||
.to_os_string();
|
||||
|
||||
temp_extension.push(OsString::from(".temp"));
|
||||
let temp_file_path = target_file_path.with_extension(temp_extension);
|
||||
let temp_file_path = path_with_suffix_extension(&target_file_path, "temp");
|
||||
let mut destination = io::BufWriter::new(
|
||||
fs::OpenOptions::new()
|
||||
.write(true)
|
||||
@@ -108,7 +131,11 @@ impl RemoteStorage for LocalFs {
|
||||
})?,
|
||||
);
|
||||
|
||||
io::copy(&mut from, &mut destination)
|
||||
let from_size_bytes = from_size_bytes as u64;
|
||||
// Require to read 1 byte more than the expected to check later, that the stream and its size match.
|
||||
let mut buffer_to_read = from.take(from_size_bytes + 1);
|
||||
|
||||
let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
@@ -117,6 +144,19 @@ impl RemoteStorage for LocalFs {
|
||||
)
|
||||
})?;
|
||||
|
||||
ensure!(
|
||||
bytes_read == from_size_bytes,
|
||||
"Provided stream has actual size {} fthat is smaller than the given stream size {}",
|
||||
bytes_read,
|
||||
from_size_bytes
|
||||
);
|
||||
|
||||
ensure!(
|
||||
buffer_to_read.read(&mut [0]).await? == 0,
|
||||
"Provided stream has bigger size than the given stream size {}",
|
||||
from_size_bytes
|
||||
);
|
||||
|
||||
destination.flush().await.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload (flush temp) file to the local storage at '{}'",
|
||||
@@ -132,14 +172,31 @@ impl RemoteStorage for LocalFs {
|
||||
target_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
if let Some(storage_metadata) = metadata {
|
||||
let storage_metadata_path = storage_metadata_path(&target_file_path);
|
||||
fs::write(
|
||||
&storage_metadata_path,
|
||||
serde_json::to_string(&storage_metadata.0)
|
||||
.context("Failed to serialize storage metadata as json")?,
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to write metadata to the local storage at '{}'",
|
||||
storage_metadata_path.display()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
from: &Self::RemoteObjectId,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<Option<StorageMetadata>> {
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
@@ -162,7 +219,8 @@ impl RemoteStorage for LocalFs {
|
||||
)
|
||||
})?;
|
||||
source.flush().await?;
|
||||
Ok(())
|
||||
|
||||
self.read_storage_metadata(&file_path).await
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
@@ -171,13 +229,13 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_range(
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
from: &Self::RemoteObjectId,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<Option<StorageMetadata>> {
|
||||
if let Some(end_exclusive) = end_exclusive {
|
||||
ensure!(
|
||||
end_exclusive > start_inclusive,
|
||||
@@ -186,7 +244,7 @@ impl RemoteStorage for LocalFs {
|
||||
end_exclusive
|
||||
);
|
||||
if start_inclusive == end_exclusive.saturating_sub(1) {
|
||||
return Ok(());
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
@@ -220,7 +278,8 @@ impl RemoteStorage for LocalFs {
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
|
||||
self.read_storage_metadata(&file_path).await
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
@@ -229,7 +288,7 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
|
||||
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(path)?;
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
Ok(fs::remove_file(file_path).await?)
|
||||
@@ -242,6 +301,10 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_metadata_path(original_path: &Path) -> PathBuf {
|
||||
path_with_suffix_extension(original_path, "metadata")
|
||||
}
|
||||
|
||||
fn get_all_files<'a, P>(
|
||||
directory_path: P,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
|
||||
@@ -291,29 +354,30 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
|
||||
|
||||
#[cfg(test)]
|
||||
mod pure_tests {
|
||||
use crate::{
|
||||
layered_repository::metadata::METADATA_FILE_NAME,
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
use tempfile::tempdir;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn storage_path_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("storage_path_positive")?;
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root.clone(),
|
||||
working_directory: workdir.clone(),
|
||||
storage_root: storage_root.clone(),
|
||||
};
|
||||
|
||||
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("file_name");
|
||||
let expected_path = storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?);
|
||||
let local_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("file_name");
|
||||
let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?);
|
||||
|
||||
assert_eq!(
|
||||
expected_path,
|
||||
storage.storage_path(&local_path).expect("Matching path should map to storage path normally"),
|
||||
"File paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir"
|
||||
storage.remote_object_id(&local_path).expect("Matching path should map to storage path normally"),
|
||||
"File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -323,7 +387,7 @@ mod pure_tests {
|
||||
fn storage_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
|
||||
match storage.storage_path(mismatching_path) {
|
||||
match storage.remote_object_id(mismatching_path) {
|
||||
Ok(wrong_path) => panic!(
|
||||
"Expected path '{}' to error, but got storage path: {:?}",
|
||||
mismatching_path.display(),
|
||||
@@ -333,16 +397,16 @@ mod pure_tests {
|
||||
}
|
||||
}
|
||||
|
||||
let repo_harness = RepoHarness::create("storage_path_negatives")?;
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root,
|
||||
working_directory: workdir.clone(),
|
||||
storage_root,
|
||||
};
|
||||
|
||||
let error_string = storage_path_error(&storage, &repo_harness.conf.workdir);
|
||||
let error_string = storage_path_error(&storage, &workdir);
|
||||
assert!(error_string.contains("does not belong to this storage"));
|
||||
assert!(error_string.contains(repo_harness.conf.workdir.to_str().unwrap()));
|
||||
assert!(error_string.contains(workdir.to_str().unwrap()));
|
||||
|
||||
let mismatching_path_str = "/something/else";
|
||||
let error_message = storage_path_error(&storage, Path::new(mismatching_path_str));
|
||||
@@ -351,7 +415,7 @@ mod pure_tests {
|
||||
"Error should mention wrong path"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
|
||||
error_message.contains(workdir.to_str().unwrap()),
|
||||
"Error should mention server workdir"
|
||||
);
|
||||
assert!(error_message.contains("does not belong to this storage"));
|
||||
@@ -361,29 +425,28 @@ mod pure_tests {
|
||||
|
||||
#[test]
|
||||
fn local_path_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("local_path_positive")?;
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root.clone(),
|
||||
working_directory: workdir.clone(),
|
||||
storage_root: storage_root.clone(),
|
||||
};
|
||||
|
||||
let name = "not a metadata";
|
||||
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name);
|
||||
let local_path = workdir.join("timelines").join("some_timeline").join(name);
|
||||
assert_eq!(
|
||||
local_path,
|
||||
storage
|
||||
.local_path(
|
||||
&storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?)
|
||||
)
|
||||
.local_path(&storage_root.join(local_path.strip_prefix(&workdir)?))
|
||||
.expect("For a valid input, valid local path should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote delta file"
|
||||
);
|
||||
|
||||
let local_metadata_path = repo_harness
|
||||
.timeline_path(&TIMELINE_ID)
|
||||
.join(METADATA_FILE_NAME);
|
||||
let remote_metadata_path = storage.storage_path(&local_metadata_path)?;
|
||||
let local_metadata_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("metadata");
|
||||
let remote_metadata_path = storage.remote_object_id(&local_metadata_path)?;
|
||||
assert_eq!(
|
||||
local_metadata_path,
|
||||
storage
|
||||
@@ -409,11 +472,10 @@ mod pure_tests {
|
||||
}
|
||||
}
|
||||
|
||||
let repo_harness = RepoHarness::create("local_path_negatives")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root,
|
||||
working_directory: tempdir()?.path().to_owned(),
|
||||
storage_root,
|
||||
};
|
||||
|
||||
let totally_wrong_path = "wrong_wrong_wrong";
|
||||
@@ -425,16 +487,19 @@ mod pure_tests {
|
||||
|
||||
#[test]
|
||||
fn download_destination_matches_original_path() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
|
||||
let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let original_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("some name");
|
||||
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let dummy_storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root,
|
||||
working_directory: workdir,
|
||||
storage_root,
|
||||
};
|
||||
|
||||
let storage_path = dummy_storage.storage_path(&original_path)?;
|
||||
let storage_path = dummy_storage.remote_object_id(&original_path)?;
|
||||
let download_destination = dummy_storage.local_path(&storage_path)?;
|
||||
|
||||
assert_eq!(
|
||||
@@ -449,23 +514,22 @@ mod pure_tests {
|
||||
#[cfg(test)]
|
||||
mod fs_tests {
|
||||
use super::*;
|
||||
use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
|
||||
|
||||
use std::io::Write;
|
||||
use std::{collections::HashMap, io::Write};
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[tokio::test]
|
||||
async fn upload_file() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("upload_file")?;
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage = create_storage()?;
|
||||
|
||||
let source = create_file_for_upload(
|
||||
&storage.pageserver_workdir.join("whatever"),
|
||||
let (file, size) = create_file_for_upload(
|
||||
&storage.working_directory.join("whatever"),
|
||||
"whatever_contents",
|
||||
)
|
||||
.await?;
|
||||
let target_path = PathBuf::from("/").join("somewhere").join("else");
|
||||
match storage.upload(source, &target_path).await {
|
||||
match storage.upload(file, size, &target_path, None).await {
|
||||
Ok(()) => panic!("Should not allow storing files with wrong target path"),
|
||||
Err(e) => {
|
||||
let message = format!("{:?}", e);
|
||||
@@ -475,14 +539,14 @@ mod fs_tests {
|
||||
}
|
||||
assert!(storage.list().await?.is_empty());
|
||||
|
||||
let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1").await?;
|
||||
let target_path_1 = upload_dummy_file(&workdir, &storage, "upload_1", None).await?;
|
||||
assert_eq!(
|
||||
storage.list().await?,
|
||||
vec![target_path_1.clone()],
|
||||
"Should list a single file after first upload"
|
||||
);
|
||||
|
||||
let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2").await?;
|
||||
let target_path_2 = upload_dummy_file(&workdir, &storage, "upload_2", None).await?;
|
||||
assert_eq!(
|
||||
list_files_sorted(&storage).await?,
|
||||
vec![target_path_1.clone(), target_path_2.clone()],
|
||||
@@ -493,22 +557,25 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
fn create_storage() -> anyhow::Result<LocalFs> {
|
||||
let pageserver_workdir = Box::leak(Box::new(tempdir()?.path().to_owned()));
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?;
|
||||
Ok(storage)
|
||||
LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_file")?;
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
|
||||
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage.download(&upload_target, &mut content_bytes).await?;
|
||||
content_bytes.flush().await?;
|
||||
let metadata = storage.download(&upload_target, &mut content_bytes).await?;
|
||||
assert!(
|
||||
metadata.is_none(),
|
||||
"No metadata should be returned for no metadata upload"
|
||||
);
|
||||
|
||||
content_bytes.flush().await?;
|
||||
let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
|
||||
assert_eq!(
|
||||
dummy_contents(upload_name),
|
||||
@@ -530,15 +597,20 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_file_range_positive")?;
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
|
||||
let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage
|
||||
.download_range(&upload_target, 0, None, &mut full_range_bytes)
|
||||
let metadata = storage
|
||||
.download_byte_range(&upload_target, 0, None, &mut full_range_bytes)
|
||||
.await?;
|
||||
assert!(
|
||||
metadata.is_none(),
|
||||
"No metadata should be returned for no metadata upload"
|
||||
);
|
||||
full_range_bytes.flush().await?;
|
||||
assert_eq!(
|
||||
dummy_contents(upload_name),
|
||||
@@ -548,14 +620,18 @@ mod fs_tests {
|
||||
|
||||
let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let same_byte = 1_000_000_000;
|
||||
storage
|
||||
.download_range(
|
||||
let metadata = storage
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
same_byte,
|
||||
Some(same_byte + 1), // exclusive end
|
||||
&mut zero_range_bytes,
|
||||
)
|
||||
.await?;
|
||||
assert!(
|
||||
metadata.is_none(),
|
||||
"No metadata should be returned for no metadata upload"
|
||||
);
|
||||
zero_range_bytes.flush().await?;
|
||||
assert!(
|
||||
zero_range_bytes.into_inner().into_inner().is_empty(),
|
||||
@@ -566,14 +642,19 @@ mod fs_tests {
|
||||
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
|
||||
|
||||
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage
|
||||
.download_range(
|
||||
let metadata = storage
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
0,
|
||||
Some(first_part_local.len() as u64),
|
||||
&mut first_part_remote,
|
||||
)
|
||||
.await?;
|
||||
assert!(
|
||||
metadata.is_none(),
|
||||
"No metadata should be returned for no metadata upload"
|
||||
);
|
||||
|
||||
first_part_remote.flush().await?;
|
||||
let first_part_remote = first_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
@@ -583,14 +664,19 @@ mod fs_tests {
|
||||
);
|
||||
|
||||
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage
|
||||
.download_range(
|
||||
let metadata = storage
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
first_part_local.len() as u64,
|
||||
Some((first_part_local.len() + second_part_local.len()) as u64),
|
||||
&mut second_part_remote,
|
||||
)
|
||||
.await?;
|
||||
assert!(
|
||||
metadata.is_none(),
|
||||
"No metadata should be returned for no metadata upload"
|
||||
);
|
||||
|
||||
second_part_remote.flush().await?;
|
||||
let second_part_remote = second_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
@@ -604,16 +690,17 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_negative() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_file_range_negative")?;
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
|
||||
let start = 10000;
|
||||
let end = 234;
|
||||
assert!(start > end, "Should test an incorrect range");
|
||||
match storage
|
||||
.download_range(&upload_target, start, Some(end), &mut io::sink())
|
||||
.download_byte_range(&upload_target, start, Some(end), &mut io::sink())
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading wrong ranges"),
|
||||
@@ -627,7 +714,7 @@ mod fs_tests {
|
||||
|
||||
let non_existing_path = PathBuf::from("somewhere").join("else");
|
||||
match storage
|
||||
.download_range(&non_existing_path, 1, Some(3), &mut io::sink())
|
||||
.download_byte_range(&non_existing_path, 1, Some(3), &mut io::sink())
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"),
|
||||
@@ -642,10 +729,11 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_file() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("delete_file")?;
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
|
||||
storage.delete(&upload_target).await?;
|
||||
assert!(storage.list().await?.is_empty());
|
||||
@@ -661,31 +749,85 @@ mod fs_tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn upload_dummy_file(
|
||||
harness: &RepoHarness,
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?;
|
||||
let storage_path = storage.root.join(relative_timeline_path).join(name);
|
||||
storage
|
||||
.upload(
|
||||
create_file_for_upload(
|
||||
&storage.pageserver_workdir.join(name),
|
||||
&dummy_contents(name),
|
||||
)
|
||||
.await?,
|
||||
&storage_path,
|
||||
#[tokio::test]
|
||||
async fn file_with_metadata() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let metadata = StorageMetadata(HashMap::from([
|
||||
("one".to_string(), "1".to_string()),
|
||||
("two".to_string(), "2".to_string()),
|
||||
]));
|
||||
let upload_target =
|
||||
upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?;
|
||||
|
||||
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?;
|
||||
|
||||
content_bytes.flush().await?;
|
||||
let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
|
||||
assert_eq!(
|
||||
dummy_contents(upload_name),
|
||||
contents,
|
||||
"We should upload and download the same contents"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
full_download_metadata.as_ref(),
|
||||
Some(&metadata),
|
||||
"We should get the same metadata back for full download"
|
||||
);
|
||||
|
||||
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
|
||||
let (first_part_local, _) = uploaded_bytes.split_at(3);
|
||||
|
||||
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let partial_download_metadata = storage
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
0,
|
||||
Some(first_part_local.len() as u64),
|
||||
&mut first_part_remote,
|
||||
)
|
||||
.await?;
|
||||
first_part_remote.flush().await?;
|
||||
let first_part_remote = first_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
first_part_local,
|
||||
first_part_remote.as_slice(),
|
||||
"First part bytes should be returned when requested"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
partial_download_metadata.as_ref(),
|
||||
Some(&metadata),
|
||||
"We should get the same metadata back for partial download"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn upload_dummy_file(
|
||||
workdir: &Path,
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
let timeline_path = workdir.join("timelines").join("some_timeline");
|
||||
let relative_timeline_path = timeline_path.strip_prefix(&workdir)?;
|
||||
let storage_path = storage.storage_root.join(relative_timeline_path).join(name);
|
||||
|
||||
let from_path = storage.working_directory.join(name);
|
||||
let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?;
|
||||
storage.upload(file, size, &storage_path, metadata).await?;
|
||||
Ok(storage_path)
|
||||
}
|
||||
|
||||
async fn create_file_for_upload(
|
||||
path: &Path,
|
||||
contents: &str,
|
||||
) -> anyhow::Result<io::BufReader<fs::File>> {
|
||||
) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
|
||||
std::fs::create_dir_all(path.parent().unwrap())?;
|
||||
let mut file_for_writing = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
@@ -693,8 +835,10 @@ mod fs_tests {
|
||||
.open(path)?;
|
||||
write!(file_for_writing, "{}", contents)?;
|
||||
drop(file_for_writing);
|
||||
Ok(io::BufReader::new(
|
||||
fs::OpenOptions::new().read(true).open(&path).await?,
|
||||
let file_size = path.metadata()?.len() as usize;
|
||||
Ok((
|
||||
io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?),
|
||||
file_size,
|
||||
))
|
||||
}
|
||||
|
||||
465
libs/remote_storage/src/s3_bucket.rs
Normal file
465
libs/remote_storage/src/s3_bucket.rs
Normal file
@@ -0,0 +1,465 @@
|
||||
//! AWS S3 storage wrapper around `rusoto` library.
|
||||
//!
|
||||
//! Respects `prefix_in_bucket` property from [`S3Config`],
|
||||
//! allowing multiple api users to independently work with the same S3 bucket, if
|
||||
//! their bucket prefixes are both specified and different.
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::Context;
|
||||
use rusoto_core::{
|
||||
credential::{InstanceMetadataProvider, StaticProvider},
|
||||
HttpClient, Region,
|
||||
};
|
||||
use rusoto_s3::{
|
||||
DeleteObjectRequest, GetObjectRequest, ListObjectsV2Request, PutObjectRequest, S3Client,
|
||||
StreamingBody, S3,
|
||||
};
|
||||
use tokio::{io, sync::Semaphore};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{strip_path_prefix, RemoteStorage, S3Config};
|
||||
|
||||
use super::StorageMetadata;
|
||||
|
||||
const S3_PREFIX_SEPARATOR: char = '/';
|
||||
|
||||
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||
pub struct S3ObjectKey(String);
|
||||
|
||||
impl S3ObjectKey {
|
||||
fn key(&self) -> &str {
|
||||
&self.0
|
||||
}
|
||||
|
||||
fn download_destination(&self, workdir: &Path, prefix_to_strip: Option<&str>) -> PathBuf {
|
||||
let path_without_prefix = match prefix_to_strip {
|
||||
Some(prefix) => self.0.strip_prefix(prefix).unwrap_or_else(|| {
|
||||
panic!(
|
||||
"Could not strip prefix '{}' from S3 object key '{}'",
|
||||
prefix, self.0
|
||||
)
|
||||
}),
|
||||
None => &self.0,
|
||||
};
|
||||
|
||||
workdir.join(
|
||||
path_without_prefix
|
||||
.split(S3_PREFIX_SEPARATOR)
|
||||
.collect::<PathBuf>(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// AWS S3 storage.
|
||||
pub struct S3Bucket {
|
||||
workdir: PathBuf,
|
||||
client: S3Client,
|
||||
bucket_name: String,
|
||||
prefix_in_bucket: Option<String>,
|
||||
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
|
||||
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
|
||||
// The helps to ensure we don't exceed the thresholds.
|
||||
concurrency_limiter: Semaphore,
|
||||
}
|
||||
|
||||
impl S3Bucket {
|
||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config, workdir: PathBuf) -> anyhow::Result<Self> {
|
||||
debug!(
|
||||
"Creating s3 remote storage for S3 bucket {}",
|
||||
aws_config.bucket_name
|
||||
);
|
||||
let region = match aws_config.endpoint.clone() {
|
||||
Some(custom_endpoint) => Region::Custom {
|
||||
name: aws_config.bucket_region.clone(),
|
||||
endpoint: custom_endpoint,
|
||||
},
|
||||
None => aws_config
|
||||
.bucket_region
|
||||
.parse::<Region>()
|
||||
.context("Failed to parse the s3 region from config")?,
|
||||
};
|
||||
let request_dispatcher = HttpClient::new().context("Failed to create S3 http client")?;
|
||||
|
||||
let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok();
|
||||
let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok();
|
||||
|
||||
let client = if access_key_id.is_none() && secret_access_key.is_none() {
|
||||
debug!("Using IAM-based AWS access");
|
||||
S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region)
|
||||
} else {
|
||||
debug!("Using credentials-based AWS access");
|
||||
S3Client::new_with(
|
||||
request_dispatcher,
|
||||
StaticProvider::new_minimal(
|
||||
access_key_id.unwrap_or_default(),
|
||||
secret_access_key.unwrap_or_default(),
|
||||
),
|
||||
region,
|
||||
)
|
||||
};
|
||||
|
||||
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
|
||||
let mut prefix = prefix;
|
||||
while prefix.starts_with(S3_PREFIX_SEPARATOR) {
|
||||
prefix = &prefix[1..]
|
||||
}
|
||||
|
||||
let mut prefix = prefix.to_string();
|
||||
while prefix.ends_with(S3_PREFIX_SEPARATOR) {
|
||||
prefix.pop();
|
||||
}
|
||||
prefix
|
||||
});
|
||||
|
||||
Ok(Self {
|
||||
client,
|
||||
workdir,
|
||||
bucket_name: aws_config.bucket_name.clone(),
|
||||
prefix_in_bucket,
|
||||
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
type RemoteObjectId = S3ObjectKey;
|
||||
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId> {
|
||||
let relative_path = strip_path_prefix(&self.workdir, local_path)?;
|
||||
let mut key = self.prefix_in_bucket.clone().unwrap_or_default();
|
||||
for segment in relative_path {
|
||||
key.push(S3_PREFIX_SEPARATOR);
|
||||
key.push_str(&segment.to_string_lossy());
|
||||
}
|
||||
Ok(S3ObjectKey(key))
|
||||
}
|
||||
|
||||
fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result<PathBuf> {
|
||||
Ok(storage_path.download_destination(&self.workdir, self.prefix_in_bucket.as_deref()))
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
|
||||
let mut document_keys = Vec::new();
|
||||
|
||||
let mut continuation_token = None;
|
||||
loop {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 list")?;
|
||||
let fetch_response = self
|
||||
.client
|
||||
.list_objects_v2(ListObjectsV2Request {
|
||||
bucket: self.bucket_name.clone(),
|
||||
prefix: self.prefix_in_bucket.clone(),
|
||||
continuation_token,
|
||||
..ListObjectsV2Request::default()
|
||||
})
|
||||
.await?;
|
||||
document_keys.extend(
|
||||
fetch_response
|
||||
.contents
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.filter_map(|o| Some(S3ObjectKey(o.key?))),
|
||||
);
|
||||
|
||||
match fetch_response.continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
Ok(document_keys)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
to: &Self::RemoteObjectId,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 upload")?;
|
||||
self.client
|
||||
.put_object(PutObjectRequest {
|
||||
body: Some(StreamingBody::new_with_size(
|
||||
ReaderStream::new(from),
|
||||
from_size_bytes,
|
||||
)),
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: to.key().to_owned(),
|
||||
metadata: metadata.map(|m| m.0),
|
||||
..PutObjectRequest::default()
|
||||
})
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 download")?;
|
||||
let object_output = self
|
||||
.client
|
||||
.get_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: from.key().to_owned(),
|
||||
..GetObjectRequest::default()
|
||||
})
|
||||
.await?;
|
||||
|
||||
if let Some(body) = object_output.body {
|
||||
let mut from = io::BufReader::new(body.into_async_read());
|
||||
io::copy(&mut from, to).await?;
|
||||
}
|
||||
|
||||
Ok(object_output.metadata.map(StorageMetadata))
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>> {
|
||||
// S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
|
||||
// and needs both ends to be exclusive
|
||||
let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
|
||||
let range = Some(match end_inclusive {
|
||||
Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive),
|
||||
None => format!("bytes={}-", start_inclusive),
|
||||
});
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 range download")?;
|
||||
let object_output = self
|
||||
.client
|
||||
.get_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: from.key().to_owned(),
|
||||
range,
|
||||
..GetObjectRequest::default()
|
||||
})
|
||||
.await?;
|
||||
|
||||
if let Some(body) = object_output.body {
|
||||
let mut from = io::BufReader::new(body.into_async_read());
|
||||
io::copy(&mut from, to).await?;
|
||||
}
|
||||
|
||||
Ok(object_output.metadata.map(StorageMetadata))
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 delete")?;
|
||||
self.client
|
||||
.delete_object(DeleteObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: path.key().to_owned(),
|
||||
..DeleteObjectRequest::default()
|
||||
})
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tempfile::tempdir;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn download_destination() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let local_path = workdir.join("one").join("two").join("test_name");
|
||||
let relative_path = local_path.strip_prefix(&workdir)?;
|
||||
|
||||
let key = S3ObjectKey(format!(
|
||||
"{}{}",
|
||||
S3_PREFIX_SEPARATOR,
|
||||
relative_path
|
||||
.iter()
|
||||
.map(|segment| segment.to_str().unwrap())
|
||||
.collect::<Vec<_>>()
|
||||
.join(&S3_PREFIX_SEPARATOR.to_string()),
|
||||
));
|
||||
|
||||
assert_eq!(
|
||||
local_path,
|
||||
key.download_destination(&workdir, None),
|
||||
"Download destination should consist of s3 path joined with the workdir prefix"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn storage_path_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let segment_1 = "matching";
|
||||
let segment_2 = "file";
|
||||
let local_path = &workdir.join(segment_1).join(segment_2);
|
||||
|
||||
let storage = dummy_storage(workdir);
|
||||
|
||||
let expected_key = S3ObjectKey(format!(
|
||||
"{}{S3_PREFIX_SEPARATOR}{segment_1}{S3_PREFIX_SEPARATOR}{segment_2}",
|
||||
storage.prefix_in_bucket.as_deref().unwrap_or_default(),
|
||||
));
|
||||
|
||||
let actual_key = storage
|
||||
.remote_object_id(local_path)
|
||||
.expect("Matching path should map to S3 path normally");
|
||||
assert_eq!(
|
||||
expected_key,
|
||||
actual_key,
|
||||
"S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn storage_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn storage_path_error(storage: &S3Bucket, mismatching_path: &Path) -> String {
|
||||
match storage.remote_object_id(mismatching_path) {
|
||||
Ok(wrong_key) => panic!(
|
||||
"Expected path '{}' to error, but got S3 key: {:?}",
|
||||
mismatching_path.display(),
|
||||
wrong_key,
|
||||
),
|
||||
Err(e) => e.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage = dummy_storage(workdir.clone());
|
||||
|
||||
let error_message = storage_path_error(&storage, &workdir);
|
||||
assert!(
|
||||
error_message.contains("Prefix and the path are equal"),
|
||||
"Message '{}' does not contain the required string",
|
||||
error_message
|
||||
);
|
||||
|
||||
let mismatching_path = PathBuf::from("somewhere").join("else");
|
||||
let error_message = storage_path_error(&storage, &mismatching_path);
|
||||
assert!(
|
||||
error_message.contains(mismatching_path.to_str().unwrap()),
|
||||
"Error should mention wrong path"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains(workdir.to_str().unwrap()),
|
||||
"Error should mention server workdir"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains("is not prefixed with"),
|
||||
"Message '{}' does not contain a required string",
|
||||
error_message
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage = dummy_storage(workdir.clone());
|
||||
let timeline_dir = workdir.join("timelines").join("test_timeline");
|
||||
let relative_timeline_path = timeline_dir.strip_prefix(&workdir)?;
|
||||
|
||||
let s3_key = create_s3_key(
|
||||
&relative_timeline_path.join("not a metadata"),
|
||||
storage.prefix_in_bucket.as_deref(),
|
||||
);
|
||||
assert_eq!(
|
||||
s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()),
|
||||
storage
|
||||
.local_path(&s3_key)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote delta file"
|
||||
);
|
||||
|
||||
let s3_key = create_s3_key(
|
||||
&relative_timeline_path.join("metadata"),
|
||||
storage.prefix_in_bucket.as_deref(),
|
||||
);
|
||||
assert_eq!(
|
||||
s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()),
|
||||
storage
|
||||
.local_path(&s3_key)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote metadata file"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn download_destination_matches_original_path() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let original_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("some name");
|
||||
|
||||
let dummy_storage = dummy_storage(workdir);
|
||||
|
||||
let key = dummy_storage.remote_object_id(&original_path)?;
|
||||
let download_destination = dummy_storage.local_path(&key)?;
|
||||
|
||||
assert_eq!(
|
||||
original_path, download_destination,
|
||||
"'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn dummy_storage(workdir: PathBuf) -> S3Bucket {
|
||||
S3Bucket {
|
||||
workdir,
|
||||
client: S3Client::new("us-east-1".parse().unwrap()),
|
||||
bucket_name: "dummy-bucket".to_string(),
|
||||
prefix_in_bucket: Some("dummy_prefix/".to_string()),
|
||||
concurrency_limiter: Semaphore::new(1),
|
||||
}
|
||||
}
|
||||
|
||||
fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> S3ObjectKey {
|
||||
S3ObjectKey(relative_file_path.iter().fold(
|
||||
prefix.unwrap_or_default().to_string(),
|
||||
|mut path_string, segment| {
|
||||
path_string.push(S3_PREFIX_SEPARATOR);
|
||||
path_string.push_str(segment.to_str().unwrap());
|
||||
path_string
|
||||
},
|
||||
))
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
[package]
|
||||
name = "zenith_utils"
|
||||
name = "utils"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
@@ -10,35 +10,35 @@ bytes = "1.0.1"
|
||||
hyper = { version = "0.14.7", features = ["full"] }
|
||||
lazy_static = "1.4.0"
|
||||
pin-project-lite = "0.2.7"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
routerify = "3"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
thiserror = "1.0"
|
||||
tokio = { version = "1.11", features = ["macros"]}
|
||||
tokio = { version = "1.17", features = ["macros"]}
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
nix = "0.23.0"
|
||||
signal-hook = "0.3.10"
|
||||
rand = "0.8.3"
|
||||
jsonwebtoken = "7"
|
||||
jsonwebtoken = "8"
|
||||
hex = { version = "0.4.3", features = ["serde"] }
|
||||
rustls = "0.19.1"
|
||||
rustls-split = "0.2.1"
|
||||
rustls = "0.20.2"
|
||||
rustls-split = "0.3.0"
|
||||
git-version = "0.3.5"
|
||||
serde_with = "1.12.0"
|
||||
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
metrics = { path = "../metrics" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
byteorder = "1.4.3"
|
||||
bytes = "1.0.1"
|
||||
hex-literal = "0.3"
|
||||
tempfile = "3.2"
|
||||
webpki = "0.21"
|
||||
criterion = "0.3"
|
||||
rustls-pemfile = "0.2.1"
|
||||
|
||||
[[bench]]
|
||||
name = "benchmarks"
|
||||
@@ -1,7 +1,7 @@
|
||||
#![allow(unused)]
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use zenith_utils::zid;
|
||||
use utils::zid;
|
||||
|
||||
pub fn bench_zid_stringify(c: &mut Criterion) {
|
||||
// Can only use public methods.
|
||||
21
libs/utils/scripts/restore_from_wal.sh
Executable file
21
libs/utils/scripts/restore_from_wal.sh
Executable file
@@ -0,0 +1,21 @@
|
||||
#!/bin/bash
|
||||
PG_BIN=$1
|
||||
WAL_PATH=$2
|
||||
DATA_DIR=$3
|
||||
PORT=$4
|
||||
SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-`
|
||||
rm -fr $DATA_DIR
|
||||
env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U zenith_admin -D $DATA_DIR --sysid=$SYSID
|
||||
echo port=$PORT >> $DATA_DIR/postgresql.conf
|
||||
REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-`
|
||||
declare -i WAL_SIZE=$REDO_POS+114
|
||||
$PG_BIN/pg_ctl -D $DATA_DIR -l logfile start
|
||||
$PG_BIN/pg_ctl -D $DATA_DIR -l logfile stop -m immediate
|
||||
cp $DATA_DIR/pg_wal/000000010000000000000001 .
|
||||
cp $WAL_PATH/* $DATA_DIR/pg_wal/
|
||||
if [ -f $DATA_DIR/pg_wal/*.partial ]
|
||||
then
|
||||
(cd $DATA_DIR/pg_wal ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done)
|
||||
fi
|
||||
dd if=000000010000000000000001 of=$DATA_DIR/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
|
||||
rm -f 000000010000000000000001
|
||||
20
libs/utils/scripts/restore_from_wal_archive.sh
Executable file
20
libs/utils/scripts/restore_from_wal_archive.sh
Executable file
@@ -0,0 +1,20 @@
|
||||
PG_BIN=$1
|
||||
WAL_PATH=$2
|
||||
DATA_DIR=$3
|
||||
PORT=$4
|
||||
SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-`
|
||||
rm -fr $DATA_DIR /tmp/pg_wals
|
||||
mkdir /tmp/pg_wals
|
||||
env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U zenith_admin -D $DATA_DIR --sysid=$SYSID
|
||||
echo port=$PORT >> $DATA_DIR/postgresql.conf
|
||||
REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-`
|
||||
declare -i WAL_SIZE=$REDO_POS+114
|
||||
cp $WAL_PATH/* /tmp/pg_wals
|
||||
if [ -f $DATA_DIR/pg_wal/*.partial ]
|
||||
then
|
||||
(cd /tmp/pg_wals ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done)
|
||||
fi
|
||||
dd if=$DATA_DIR/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
|
||||
echo > $DATA_DIR/recovery.signal
|
||||
rm -f $DATA_DIR/pg_wal/*
|
||||
echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> $DATA_DIR/postgresql.conf
|
||||
@@ -5,7 +5,7 @@
|
||||
/// For example, to calculate the smallest value among some integers:
|
||||
///
|
||||
/// ```
|
||||
/// use zenith_utils::accum::Accum;
|
||||
/// use utils::accum::Accum;
|
||||
///
|
||||
/// let values = [1, 2, 3];
|
||||
///
|
||||
@@ -1,8 +1,6 @@
|
||||
// For details about authentication see docs/authentication.md
|
||||
// TODO there are two issues for our use case in jsonwebtoken library which will be resolved in next release
|
||||
// The first one is that there is no way to disable expiration claim, but it can be excluded from validation, so use this as a workaround for now.
|
||||
// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/190
|
||||
// The second one is that we wanted to use ed25519 keys, but they are also not supported until next version. So we go with RSA keys for now.
|
||||
//
|
||||
// TODO: use ed25519 keys
|
||||
// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162
|
||||
|
||||
use serde;
|
||||
@@ -59,19 +57,19 @@ pub fn check_permission(claims: &Claims, tenantid: Option<ZTenantId>) -> Result<
|
||||
}
|
||||
|
||||
pub struct JwtAuth {
|
||||
decoding_key: DecodingKey<'static>,
|
||||
decoding_key: DecodingKey,
|
||||
validation: Validation,
|
||||
}
|
||||
|
||||
impl JwtAuth {
|
||||
pub fn new(decoding_key: DecodingKey<'_>) -> Self {
|
||||
pub fn new(decoding_key: DecodingKey) -> Self {
|
||||
let mut validation = Validation::new(JWT_ALGORITHM);
|
||||
// The default 'required_spec_claims' is 'exp'. But we don't want to require
|
||||
// expiration.
|
||||
validation.required_spec_claims = [].into();
|
||||
Self {
|
||||
decoding_key: decoding_key.into_static(),
|
||||
validation: Validation {
|
||||
algorithms: vec![JWT_ALGORITHM],
|
||||
validate_exp: false,
|
||||
..Default::default()
|
||||
},
|
||||
decoding_key,
|
||||
validation,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,12 +5,11 @@ use anyhow::anyhow;
|
||||
use hyper::header::AUTHORIZATION;
|
||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
|
||||
use lazy_static::lazy_static;
|
||||
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::RequestInfo;
|
||||
use routerify::{Middleware, Router, RouterBuilder, RouterService};
|
||||
use tracing::info;
|
||||
use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter};
|
||||
use zenith_metrics::{Encoder, TextEncoder};
|
||||
|
||||
use std::future::Future;
|
||||
use std::net::TcpListener;
|
||||
@@ -19,7 +18,7 @@ use super::error::ApiError;
|
||||
|
||||
lazy_static! {
|
||||
static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!(
|
||||
new_common_metric_name("serve_metrics_count"),
|
||||
"libmetrics_metric_handler_requests_total",
|
||||
"Number of metric requests made"
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
@@ -36,7 +35,7 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
|
||||
let mut buffer = vec![];
|
||||
let encoder = TextEncoder::new();
|
||||
|
||||
let metrics = zenith_metrics::gather();
|
||||
let metrics = metrics::gather();
|
||||
encoder.encode(&metrics, &mut buffer).unwrap();
|
||||
|
||||
let response = Response::builder()
|
||||
@@ -160,7 +159,7 @@ pub fn serve_thread_main<S>(
|
||||
where
|
||||
S: Future<Output = ()> + Send + Sync,
|
||||
{
|
||||
info!("Starting a http endpoint at {}", listener.local_addr()?);
|
||||
info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
|
||||
|
||||
// Create a Service from the router above to handle incoming requests.
|
||||
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
|
||||
@@ -17,6 +17,9 @@ pub enum ApiError {
|
||||
#[error("NotFound: {0}")]
|
||||
NotFound(String),
|
||||
|
||||
#[error("Conflict: {0}")]
|
||||
Conflict(String),
|
||||
|
||||
#[error(transparent)]
|
||||
InternalServerError(#[from] anyhow::Error),
|
||||
}
|
||||
@@ -42,6 +45,9 @@ impl ApiError {
|
||||
ApiError::NotFound(_) => {
|
||||
HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::NOT_FOUND)
|
||||
}
|
||||
ApiError::Conflict(_) => {
|
||||
HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::CONFLICT)
|
||||
}
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
@@ -10,8 +10,8 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
|
||||
let whole_body = hyper::body::aggregate(request.body_mut())
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
Ok(serde_json::from_reader(whole_body.reader())
|
||||
.map_err(|err| ApiError::BadRequest(format!("Failed to parse json request {}", err)))?)
|
||||
serde_json::from_reader(whole_body.reader())
|
||||
.map_err(|err| ApiError::BadRequest(format!("Failed to parse json request {}", err)))
|
||||
}
|
||||
|
||||
pub fn json_response<T: Serialize>(
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use super::error::ApiError;
|
||||
use hyper::{Body, Request};
|
||||
use hyper::{body::HttpBody, Body, Request};
|
||||
use routerify::ext::RequestExt;
|
||||
|
||||
pub fn get_request_param<'a>(
|
||||
@@ -31,3 +31,10 @@ pub fn parse_request_param<T: FromStr>(
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
|
||||
match request.body_mut().data().await {
|
||||
Some(_) => Err(ApiError::BadRequest("Unexpected request body".into())),
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user