Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-03 02:30:37 +00:00)

Compare commits: partial_im...page_cache (69 commits)
| SHA1 |
|---|
| 47d29613a7 |
| bc5ec43056 |
| b237feedab |
| 4d1e48f3b9 |
| 7576b18b14 |
| 6b49b370fc |
| 91411c415a |
| c67cf34040 |
| 8fbe437768 |
| 989d78aac8 |
| 7ca72578f9 |
| 41550ec8bf |
| 0cd2d91b9d |
| 546e9bdbec |
| 59bc7e67e0 |
| 2418e72649 |
| 80746b1c7a |
| 129f7c82b7 |
| 0ec5ddea0b |
| c4ee62d427 |
| c709354579 |
| 5d6553d41d |
| f03b7c3458 |
| 9c24de254f |
| 538876650a |
| 500239176c |
| ee64a6b80b |
| a13b486943 |
| 9fe4548e13 |
| 14c623b254 |
| ebf54b0de0 |
| 09dda35dac |
| 6ace79345d |
| 771e61425e |
| 93775f6ca7 |
| 6d0dacc4ce |
| e5e40a31f4 |
| 676c63c329 |
| 47366522a8 |
| db26bc49cc |
| e520293090 |
| 241e549757 |
| 34bea270f0 |
| 13f0e7a5b4 |
| 3e35f10adc |
| 3be3bb7730 |
| 01d2c52c82 |
| 9f79e7edea |
| a22165d41e |
| 725be60bb7 |
| e516c376d6 |
| 8e51c27e1a |
| 9e1eb69d55 |
| 687ba81366 |
| 47bae68a2e |
| e8b195acb7 |
| 254cb7dc4f |
| ed85d97f17 |
| 4a216c5f7f |
| c5a428a61a |
| ff8c481777 |
| f25dd75be9 |
| b99bed510d |
| 580584c8fc |
| d823e84ed5 |
| 231dfbaed6 |
| 5cf53786f9 |
| 9b9bbad462 |
| 537b2c1ae6 |
.github/PULL_REQUEST_TEMPLATE/release-pr.md (vendored, 2 lines changed)

@@ -10,7 +10,7 @@
 <!-- List everything that should be done **before** release, any issues / setting changes / etc -->

 ### Checklist after release

-- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/120/files))
+- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/219/files))
 - [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
 - [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
 - [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)
.github/actions/allure-report/action.yml (vendored, 4 lines changed)

@@ -47,7 +47,7 @@ runs:
 else
 key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -c "[:alnum:]._-" "-")
 fi
-echo "::set-output name=KEY::${key}"
+echo "KEY=${key}" >> $GITHUB_OUTPUT

 - uses: actions/setup-java@v3
 if: ${{ inputs.action == 'generate' }}
@@ -186,7 +186,7 @@ runs:
 aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"

 echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
-echo "::set-output name=report-url::${REPORT_URL}"
+echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT

 - name: Release Allure lock
 if: ${{ inputs.action == 'generate' && always() }}
.github/actions/download/action.yml (vendored, 4 lines changed)

@@ -34,7 +34,7 @@ runs:
 S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
 if [ -z "${S3_KEY}" ]; then
 if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
-echo '::set-output name=SKIPPED::true'
+echo 'SKIPPED=true' >> $GITHUB_OUTPUT
 exit 0
 else
 echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
@@ -42,7 +42,7 @@
 fi
 fi

-echo '::set-output name=SKIPPED::false'
+echo 'SKIPPED=false' >> $GITHUB_OUTPUT

 mkdir -p $(dirname $ARCHIVE)
 time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} ${ARCHIVE}
(file name lost in this capture)

@@ -41,8 +41,8 @@
 ;;
 esac

-echo "::set-output name=api_host::${API_HOST}"
-echo "::set-output name=region_id::${REGION_ID}"
+echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT
+echo "region_id=${REGION_ID}" >> $GITHUB_OUTPUT
 env:
 ENVIRONMENT: ${{ inputs.environment }}
 REGION_ID: ${{ inputs.region_id }}
@@ -72,10 +72,10 @@

 dsn=$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .dsn')/main
 echo "::add-mask::${dsn}"
-echo "::set-output name=dsn::${dsn}"
+echo "dsn=${dsn}" >> $GITHUB_OUTPUT

 project_id=$(echo $project | jq --raw-output '.id')
-echo "::set-output name=project_id::${project_id}"
+echo "project_id=${project_id}" >> $GITHUB_OUTPUT
 env:
 API_KEY: ${{ inputs.api_key }}
 API_HOST: ${{ steps.parse-input.outputs.api_host }}

(file name lost in this capture)

@@ -32,7 +32,7 @@
 ;;
 esac

-echo "::set-output name=api_host::${API_HOST}"
+echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT
 env:
 ENVIRONMENT: ${{ inputs.environment }}

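A recurring change across the actions and workflows in this range is the migration from the deprecated `::set-output` workflow command to writing step outputs into the file referenced by `$GITHUB_OUTPUT`. A minimal sketch of the pattern (the step id `myStep` and output name `result` are illustrative placeholders, not names taken from these diffs):

```bash
# Old, deprecated form: print a workflow command to stdout.
echo "::set-output name=result::42"

# New form: append key=value to the file GitHub Actions exposes as $GITHUB_OUTPUT.
echo "result=42" >> "$GITHUB_OUTPUT"

# Downstream steps keep reading the value the same way, e.g.
#   ${{ steps.myStep.outputs.result }}
```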
.github/ansible/.gitignore (vendored, 3 lines changed)

@@ -2,3 +2,6 @@ zenith_install.tar.gz
 .zenith_current_version
 neon_install.tar.gz
 .neon_current_version
+
+collections/*
+!collections/.keep
.github/ansible/ansible.cfg (vendored, 1 line changed)

@@ -3,6 +3,7 @@
 localhost_warning = False
 host_key_checking = False
 timeout = 30
+collections_paths = ./collections

 [ssh_connection]
 ssh_args = -F ./ansible.ssh.cfg
.github/ansible/collections/.keep (vendored, new empty file)
.github/ansible/deploy.yaml (vendored, 41 lines changed)

@@ -1,7 +1,7 @@
 - name: Upload Neon binaries
 hosts: storage
 gather_facts: False
-remote_user: admin
+remote_user: "{{ remote_user }}"

 tasks:

@@ -14,7 +14,8 @@
 - safekeeper

 - name: inform about versions
-debug: msg="Version to deploy - {{ current_version }}"
+debug:
+msg: "Version to deploy - {{ current_version }}"
 tags:
 - pageserver
 - safekeeper
@@ -35,7 +36,7 @@
 - name: Deploy pageserver
 hosts: pageservers
 gather_facts: False
-remote_user: admin
+remote_user: "{{ remote_user }}"

 tasks:

@@ -63,15 +64,29 @@
 tags:
 - pageserver

-- name: update remote storage (s3) config
-lineinfile:
-path: /storage/pageserver/data/pageserver.toml
-line: "{{ item }}"
-loop:
-- "[remote_storage]"
-- "bucket_name = '{{ bucket_name }}'"
-- "bucket_region = '{{ bucket_region }}'"
-- "prefix_in_bucket = '{{ inventory_hostname }}'"
+- name: read the existing remote pageserver config
+ansible.builtin.slurp:
+src: /storage/pageserver/data/pageserver.toml
+register: _remote_ps_config
+tags:
+- pageserver
+
+- name: parse the existing pageserver configuration
+ansible.builtin.set_fact:
+_existing_ps_config: "{{ _remote_ps_config['content'] | b64decode | sivel.toiletwater.from_toml }}"
+tags:
+- pageserver
+
+- name: construct the final pageserver configuration dict
+ansible.builtin.set_fact:
+pageserver_config: "{{ pageserver_config_stub | combine({'id': _existing_ps_config.id }) }}"
+tags:
+- pageserver
+
+- name: template the pageserver config
+template:
+src: templates/pageserver.toml.j2
+dest: /storage/pageserver/data/pageserver.toml
 become: true
 tags:
 - pageserver
@@ -109,7 +124,7 @@
 - name: Deploy safekeeper
 hosts: safekeepers
 gather_facts: False
-remote_user: admin
+remote_user: "{{ remote_user }}"

 tasks:
.github/ansible/get_binaries.sh (vendored, 1 line changed)

@@ -23,6 +23,7 @@ docker cp ${ID}:/data/postgres_install.tar.gz .
 tar -xzf postgres_install.tar.gz -C neon_install
 mkdir neon_install/bin/
 docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/
+docker cp ${ID}:/usr/local/bin/pageserver_binutils neon_install/bin/
 docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/
 docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/
 docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/
.github/ansible/neon-stress.hosts (vendored, deleted, 20 lines)

@@ -1,20 +0,0 @@
-[pageservers]
-neon-stress-ps-1 console_region_id=1
-neon-stress-ps-2 console_region_id=1
-
-[safekeepers]
-neon-stress-sk-1 console_region_id=1
-neon-stress-sk-2 console_region_id=1
-neon-stress-sk-3 console_region_id=1
-
-[storage:children]
-pageservers
-safekeepers
-
-[storage:vars]
-env_name = neon-stress
-console_mgmt_base_url = http://neon-stress-console.local
-bucket_name = neon-storage-ireland
-bucket_region = eu-west-1
-etcd_endpoints = etcd-stress.local:2379
-safekeeper_enable_s3_offload = false
.github/ansible/neon-stress.hosts.yaml (vendored, new file, 31 lines)

@@ -0,0 +1,31 @@
+storage:
+  vars:
+    bucket_name: neon-storage-ireland
+    bucket_region: eu-west-1
+    console_mgmt_base_url: http://neon-stress-console.local
+    env_name: neon-stress
+    etcd_endpoints: neon-stress-etcd.local:2379
+    safekeeper_enable_s3_offload: 'false'
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "{{ inventory_hostname }}"
+    hostname_suffix: ".local"
+    remote_user: admin
+  children:
+    pageservers:
+      hosts:
+        neon-stress-ps-1:
+          console_region_id: aws-eu-west-1
+        neon-stress-ps-2:
+          console_region_id: aws-eu-west-1
+    safekeepers:
+      hosts:
+        neon-stress-sk-1:
+          console_region_id: aws-eu-west-1
+        neon-stress-sk-2:
+          console_region_id: aws-eu-west-1
+        neon-stress-sk-3:
+          console_region_id: aws-eu-west-1
.github/ansible/production.hosts (vendored, deleted, 20 lines)

@@ -1,20 +0,0 @@
-[pageservers]
-#zenith-1-ps-1 console_region_id=1
-zenith-1-ps-2 console_region_id=1
-zenith-1-ps-3 console_region_id=1
-
-[safekeepers]
-zenith-1-sk-1 console_region_id=1
-zenith-1-sk-2 console_region_id=1
-zenith-1-sk-3 console_region_id=1
-
-[storage:children]
-pageservers
-safekeepers
-
-[storage:vars]
-env_name = prod-1
-console_mgmt_base_url = http://console-release.local
-bucket_name = zenith-storage-oregon
-bucket_region = us-west-2
-etcd_endpoints = zenith-1-etcd.local:2379
.github/ansible/production.hosts.yaml (vendored, new file, 33 lines)

@@ -0,0 +1,33 @@
+---
+storage:
+  vars:
+    env_name: prod-1
+    console_mgmt_base_url: http://console-release.local
+    bucket_name: zenith-storage-oregon
+    bucket_region: us-west-2
+    etcd_endpoints: zenith-1-etcd.local:2379
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "{{ inventory_hostname }}"
+    hostname_suffix: ".local"
+    remote_user: admin
+
+  children:
+    pageservers:
+      hosts:
+        zenith-1-ps-2:
+          console_region_id: aws-us-west-2
+        zenith-1-ps-3:
+          console_region_id: aws-us-west-2
+
+    safekeepers:
+      hosts:
+        zenith-1-sk-1:
+          console_region_id: aws-us-west-2
+        zenith-1-sk-2:
+          console_region_id: aws-us-west-2
+        zenith-1-sk-3:
+          console_region_id: aws-us-west-2
.github/ansible/scripts/init_pageserver.sh (vendored, 9 lines changed)

@@ -12,18 +12,19 @@ cat <<EOF | tee /tmp/payload
 "version": 1,
 "host": "${HOST}",
 "port": 6400,
-"region_id": {{ console_region_id }},
+"region_id": "{{ console_region_id }}",
 "instance_id": "${INSTANCE_ID}",
 "http_host": "${HOST}",
-"http_port": 9898
+"http_port": 9898,
+"active": false
 }
 EOF

 # check if pageserver already registered or not
-if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/pageservers/${INSTANCE_ID} -o /dev/null; then
+if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/${INSTANCE_ID} -o /dev/null; then

 # not registered, so register it now
-ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/pageservers -d@/tmp/payload | jq -r '.ID')
+ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers -d@/tmp/payload | jq -r '.id')

 # init pageserver
 sudo -u pageserver /usr/local/bin/pageserver -c "id=${ID}" -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
.github/ansible/scripts/init_safekeeper.sh (vendored, 10 lines changed)

@@ -14,18 +14,18 @@ cat <<EOF | tee /tmp/payload
 "host": "${HOST}",
 "port": 6500,
 "http_port": 7676,
-"region_id": {{ console_region_id }},
+"region_id": "{{ console_region_id }}",
 "instance_id": "${INSTANCE_ID}",
-"availability_zone_id": "${AZ_ID}"
+"availability_zone_id": "${AZ_ID}",
+"active": false
 }
 EOF

 # check if safekeeper already registered or not
-if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/safekeepers/${INSTANCE_ID} -o /dev/null; then
+if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/${INSTANCE_ID} -o /dev/null; then

 # not registered, so register it now
-ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers -d@/tmp/payload | jq -r '.ID')
+ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers -d@/tmp/payload | jq -r '.id')

 # init safekeeper
 sudo -u safekeeper /usr/local/bin/safekeeper --id ${ID} --init -D /storage/safekeeper/data
 fi
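Both init scripts now talk to the v2 management API and authenticate with a bearer token that the playbook passes in. A condensed sketch of the resulting check-then-register flow for the pageserver case, with placeholder values where the real scripts use Ansible variables or instance metadata:

```bash
#!/usr/bin/env bash
set -euo pipefail

# Placeholders: rendered from {{ CONSOLE_API_TOKEN }}, {{ console_mgmt_base_url }}
# and instance metadata in the real script.
CONSOLE_API_TOKEN="..."
BASE_URL="http://console-staging.local"
INSTANCE_ID="i-0123456789abcdef0"

AUTH=(-H "Authorization: Bearer ${CONSOLE_API_TOKEN}")

# Already registered? The v2 endpoint is probed with a plain GET instead of the old PATCH.
if ! curl -sf "${AUTH[@]}" "${BASE_URL}/management/api/v2/pageservers/${INSTANCE_ID}" -o /dev/null; then
    # Not registered: register it and read back the id (lower-case '.id' in the v2 response).
    ID=$(curl -sf -X POST "${AUTH[@]}" "${BASE_URL}/management/api/v2/pageservers" -d @/tmp/payload | jq -r '.id')
    sudo -u pageserver /usr/local/bin/pageserver -c "id=${ID}" -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
fi
```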
.github/ansible/ssm_config (vendored, new file, 3 lines)

@@ -0,0 +1,3 @@
+ansible_connection: aws_ssm
+ansible_aws_ssm_bucket_name: neon-dev-bucket
+ansible_python_interpreter: /usr/bin/python3
.github/ansible/staging.hosts (vendored, deleted, 25 lines)

@@ -1,25 +0,0 @@
-[pageservers]
-#zenith-us-stage-ps-1 console_region_id=27
-zenith-us-stage-ps-2 console_region_id=27
-zenith-us-stage-ps-3 console_region_id=27
-zenith-us-stage-ps-4 console_region_id=27
-zenith-us-stage-test-ps-1 console_region_id=28
-
-[safekeepers]
-zenith-us-stage-sk-4 console_region_id=27
-zenith-us-stage-sk-5 console_region_id=27
-zenith-us-stage-sk-6 console_region_id=27
-zenith-us-stage-test-sk-1 console_region_id=28
-zenith-us-stage-test-sk-2 console_region_id=28
-zenith-us-stage-test-sk-3 console_region_id=28
-
-[storage:children]
-pageservers
-safekeepers
-
-[storage:vars]
-env_name = us-stage
-console_mgmt_base_url = http://console-staging.local
-bucket_name = zenith-staging-storage-us-east-1
-bucket_region = us-east-1
-etcd_endpoints = zenith-us-stage-etcd.local:2379
.github/ansible/staging.hosts.yaml (vendored, new file, 34 lines)

@@ -0,0 +1,34 @@
+storage:
+  vars:
+    bucket_name: zenith-staging-storage-us-east-1
+    bucket_region: us-east-1
+    console_mgmt_base_url: http://console-staging.local
+    env_name: us-stage
+    etcd_endpoints: zenith-us-stage-etcd.local:2379
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "{{ inventory_hostname }}"
+    hostname_suffix: ".local"
+    remote_user: admin
+
+  children:
+    pageservers:
+      hosts:
+        zenith-us-stage-ps-2:
+          console_region_id: aws-us-east-1
+        zenith-us-stage-ps-3:
+          console_region_id: aws-us-east-1
+        zenith-us-stage-ps-4:
+          console_region_id: aws-us-east-1
+
+    safekeepers:
+      hosts:
+        zenith-us-stage-sk-4:
+          console_region_id: aws-us-east-1
+        zenith-us-stage-sk-5:
+          console_region_id: aws-us-east-1
+        zenith-us-stage-sk-6:
+          console_region_id: aws-us-east-1
.github/ansible/staging.us-east-2.hosts.yaml (vendored, new file, 32 lines)

@@ -0,0 +1,32 @@
+storage:
+  vars:
+    bucket_name: neon-staging-storage-us-east-2
+    bucket_region: us-east-2
+    console_mgmt_base_url: http://console-staging.local
+    env_name: us-stage
+    etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "pageserver/v1"
+    hostname_suffix: ""
+    remote_user: ssm-user
+    ansible_aws_ssm_region: us-east-2
+    console_region_id: aws-us-east-2
+
+  children:
+    pageservers:
+      hosts:
+        pageserver-0.us-east-2.aws.neon.build:
+          ansible_host: i-0c3e70929edb5d691
+
+    safekeepers:
+      hosts:
+        safekeeper-0.us-east-2.aws.neon.build:
+          ansible_host: i-027662bd552bf5db0
+        safekeeper-1.us-east-2.aws.neon.build:
+          ansible_host: i-0171efc3604a7b907
+        safekeeper-2.us-east-2.aws.neon.build:
+          ansible_host: i-0de0b03a51676a6ce
.github/ansible/systemd/pageserver.service (vendored, 2 lines changed)

@@ -1,5 +1,5 @@
 [Unit]
-Description=Zenith pageserver
+Description=Neon pageserver
 After=network.target auditd.service

 [Service]
.github/ansible/systemd/safekeeper.service (vendored, 4 lines changed)

@@ -1,12 +1,12 @@
 [Unit]
-Description=Zenith safekeeper
+Description=Neon safekeeper
 After=network.target auditd.service

 [Service]
 Type=simple
 User=safekeeper
 Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib
-ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
+ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
 ExecReload=/bin/kill -HUP $MAINPID
 KillMode=mixed
 KillSignal=SIGINT
.github/ansible/templates/pageserver.toml.j2 (vendored, new file, 1 line)

@@ -0,0 +1 @@
+{{ pageserver_config | sivel.toiletwater.to_toml }}
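Taken together, the Ansible changes replace the old `lineinfile`-based edit of `pageserver.toml` with a read-merge-render cycle: the existing config is slurped, parsed with the `sivel.toiletwater.from_toml` filter, merged with the per-environment `pageserver_config_stub`, and written back through the template above. A rough sketch of running the updated playbook by hand, following the commands the workflow diff below adds (the token value is a placeholder):

```bash
cd .github/ansible

# The from_toml / to_toml filters used by deploy.yaml live in this collection,
# installed into ./collections (see the ansible.cfg and .gitignore changes above).
ansible-galaxy collection install sivel.toiletwater

# Deploy against one of the new YAML inventories; CONSOLE_API_TOKEN is a placeholder secret.
ansible-playbook deploy.yaml -i staging.hosts.yaml -e CONSOLE_API_TOKEN=<token>

# For the SSM-managed us-east-2 hosts, the extra connection vars are passed in as well.
ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=<token>
```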
.github/workflows/benchmarking.yml (vendored, 42 lines changed)

@@ -46,7 +46,7 @@ jobs:
 runs-on: [self-hosted, zenith-benchmarker]

 env:
-POSTGRES_DISTRIB_DIR: /tmp/pg_install
+POSTGRES_DISTRIB_DIR: /usr/pgsql
 DEFAULT_PG_VERSION: 14

 steps:
@@ -138,22 +138,31 @@
 SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

 pgbench-compare:
-env:
-TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
-TEST_PG_BENCH_SCALES_MATRIX: "10gb"
-POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-DEFAULT_PG_VERSION: 14
-TEST_OUTPUT: /tmp/test_output
-BUILD_TYPE: remote
-SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}

 strategy:
 fail-fast: false
 matrix:
 # neon-captest-new: Run pgbench in a freshly created project
 # neon-captest-reuse: Same, but reusing existing project
 # neon-captest-prefetch: Same, with prefetching enabled (new project)
-platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-aurora ]
+platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch ]
+db_size: [ 10gb ]
+include:
+- platform: neon-captest-new
+db_size: 50gb
+- platform: neon-captest-prefetch
+db_size: 50gb
+- platform: rds-aurora
+db_size: 50gb
+
+env:
+TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
+TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
+POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+DEFAULT_PG_VERSION: 14
+TEST_OUTPUT: /tmp/test_output
+BUILD_TYPE: remote
+SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
+PLATFORM: ${{ matrix.platform }}

 runs-on: dev
 container:
@@ -178,7 +187,7 @@ jobs:
 echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH

 - name: Create Neon Project
-if: matrix.platform != 'neon-captest-reuse'
+if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch"]'), matrix.platform)
 id: create-neon-project
 uses: ./.github/actions/neon-project-create
 with:
@@ -204,11 +213,9 @@
 ;;
 esac

-echo "::set-output name=connstr::${CONNSTR}"
+echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

 psql ${CONNSTR} -c "SELECT version();"
-env:
-PLATFORM: ${{ matrix.platform }}

 - name: Set database options
 if: matrix.platform == 'neon-captest-prefetch'
@@ -227,7 +234,6 @@
 save_perf_report: ${{ env.SAVE_PERF_REPORT }}
 extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
 env:
-PLATFORM: ${{ matrix.platform }}
 BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
 VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
 PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -241,7 +247,6 @@
 save_perf_report: ${{ env.SAVE_PERF_REPORT }}
 extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
 env:
-PLATFORM: ${{ matrix.platform }}
 BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
 VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
 PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -255,7 +260,6 @@
 save_perf_report: ${{ env.SAVE_PERF_REPORT }}
 extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
 env:
-PLATFORM: ${{ matrix.platform }}
 BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
 VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
 PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -268,7 +272,7 @@
 build_type: ${{ env.BUILD_TYPE }}

 - name: Delete Neon Project
-if: ${{ matrix.platform != 'neon-captest-reuse' && always() }}
+if: ${{ steps.create-neon-project.outputs.project_id && always() }}
 uses: ./.github/actions/neon-project-delete
 with:
 environment: dev
.github/workflows/build_and_test.yml (vendored, 91 lines changed)

@@ -35,12 +35,12 @@ jobs:
 echo ref:$GITHUB_REF_NAME
 echo rev:$(git rev-list --count HEAD)
 if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-echo "::set-output name=tag::$(git rev-list --count HEAD)"
+echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
 elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
+echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
 else
 echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
-echo "::set-output name=tag::$GITHUB_RUN_ID"
+echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
 fi
 shell: bash
 id: build-tag
@@ -78,12 +78,12 @@ jobs:

 - name: Set pg 14 revision for caching
 id: pg_v14_rev
-run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14)
+run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
 shell: bash -euxo pipefail {0}

 - name: Set pg 15 revision for caching
 id: pg_v15_rev
-run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15)
+run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
 shell: bash -euxo pipefail {0}

 # Set some environment variables used by all the steps.
@@ -494,7 +494,7 @@ jobs:
 run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

 - name: Kaniko build neon
-run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID
+run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID

 compute-tools-image:
 runs-on: dev
@@ -508,7 +508,7 @@ jobs:
 run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

 - name: Kaniko build compute tools
-run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID
+run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID

 compute-node-image:
 runs-on: dev
@@ -527,7 +527,7 @@ jobs:
 # cloud repo depends on this image name, thus duplicating it
 # remove compute-node when cloud repo is updated
 - name: Kaniko build compute node with extensions v14 (compatibility)
-run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID
+run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID

 compute-node-image-v14:
 runs-on: dev
@@ -543,7 +543,7 @@ jobs:
 run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

 - name: Kaniko build compute node with extensions v14
-run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID
+run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID


 compute-node-image-v15:
@@ -560,11 +560,11 @@ jobs:
 run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

 - name: Kaniko build compute node with extensions v15
-run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID
+run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID

 promote-images:
 runs-on: dev
-needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-tools-image ]
+needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
 if: github.event_name != 'workflow_dispatch'
 container: amazon/aws-cli
 strategy:
@@ -573,7 +573,7 @@ jobs:
 # compute-node uses postgres 14, which is default now
 # cloud repo depends on this image name, thus duplicating it
 # remove compute-node when cloud repo is updated
-name: [ neon, compute-node, compute-node-v14, compute-tools ]
+name: [ neon, compute-node, compute-node-v14, compute-node-v15, compute-tools ]

 steps:
 - name: Promote image to latest
@@ -608,6 +608,9 @@ jobs:
 - name: Pull compute node v14 image from ECR
 run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest compute-node-v14

+- name: Pull compute node v15 image from ECR
+run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest compute-node-v15
+
 - name: Pull rust image from ECR
 run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust

@@ -619,6 +622,8 @@ jobs:
 crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
 crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
 crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
+crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
+crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest

 - name: Configure Docker Hub login
 run: |
@@ -638,6 +643,9 @@ jobs:
 - name: Push compute node v14 image to Docker Hub
 run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}

+- name: Push compute node v15 image to Docker Hub
+run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}}
+
 - name: Push rust image to Docker Hub
 run: crane push rust neondatabase/rust:pinned

@@ -650,6 +658,7 @@ jobs:
 crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
 crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest
 crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest

 calculate-deploy-targets:
 runs-on: [ self-hosted, Linux, k8s-runner ]
@@ -662,12 +671,12 @@ jobs:
 - id: set-matrix
 run: |
 if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}'
-NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}'
-echo "::set-output name=include::[$STAGING, $NEON_STRESS]"
+STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
+NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}'
+echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT
 elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}'
-echo "::set-output name=include::[$PRODUCTION]"
+PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
+echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT
 else
 echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
 exit 1
@@ -703,7 +712,7 @@ jobs:
 - name: Setup ansible
 run: |
 export PATH="/root/.local/bin:$PATH"
-pip install --progress-bar off --user ansible boto3
+pip install --progress-bar off --user ansible boto3 toml

 - name: Redeploy
 run: |
@@ -725,8 +734,48 @@ jobs:
 chmod 0600 ssh-key
 ssh-add ssh-key
 rm -f ssh-key ssh-key-cert.pub
-ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts
+ansible-galaxy collection install sivel.toiletwater
+ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }}
+rm -f neon_install.tar.gz .neon_current_version
+
+deploy-new:
+runs-on: dev
+container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
+# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
+needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
+if: |
+(github.ref_name == 'main') &&
+github.event_name != 'workflow_dispatch'
+defaults:
+run:
+shell: bash
+env:
+AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+steps:
+- name: Checkout
+uses: actions/checkout@v3
+with:
+submodules: true
+fetch-depth: 0
+
+- name: Redeploy
+run: |
+export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
+cd "$(pwd)/.github/ansible"
+
+if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+./get_binaries.sh
+elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+RELEASE=true ./get_binaries.sh
+else
+echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+exit 1
+fi
+
+ansible-galaxy collection install sivel.toiletwater
+ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
 rm -f neon_install.tar.gz .neon_current_version

 deploy-proxy:
@@ -768,5 +817,5 @@ jobs:
 - name: Re-deploy proxy
 run: |
 DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
-helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
+helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
+helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
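All of the Kaniko build steps above now pass the commit SHA into the image build via `--build-arg GIT_VERSION=${{ github.sha }}`. A rough local equivalent outside CI, assuming the Dockerfiles declare a `GIT_VERSION` build argument (the image tag here is a placeholder):

```bash
# Hypothetical local counterpart of the CI Kaniko invocation.
GIT_VERSION=$(git rev-parse HEAD)
docker build \
  --build-arg GIT_VERSION="${GIT_VERSION}" \
  -t neon:local \
  .
```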
.github/workflows/codestyle.yml (vendored, 6 lines changed)

@@ -36,7 +36,7 @@ jobs:

 steps:
 - name: Checkout
-uses: actions/checkout@v2
+uses: actions/checkout@v3
 with:
 submodules: true
 fetch-depth: 2
@@ -56,12 +56,12 @@ jobs:

 - name: Set pg 14 revision for caching
 id: pg_v14_rev
-run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14)
+run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
 shell: bash -euxo pipefail {0}

 - name: Set pg 15 revision for caching
 id: pg_v15_rev
-run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15)
+run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
 shell: bash -euxo pipefail {0}

 - name: Cache postgres v14 build
Cargo.lock (generated, 974 lines changed): file diff suppressed because it is too large.
Cargo.toml (11 lines changed)

@@ -1,3 +1,14 @@
+# 'named-profiles' feature was stabilized in cargo 1.57. This line makes the
+# build work with older cargo versions.
+#
+# We have this because as of this writing, the latest cargo Debian package
+# that's available is 1.56. (Confusingly, the Debian package version number
+# is 0.57, whereas 'cargo --version' says 1.56.)
+#
+# See https://tracker.debian.org/pkg/cargo for the current status of the
+# package. When that gets updated, we can remove this.
+cargo-features = ["named-profiles"]
+
 [workspace]
 members = [
 "compute_tools",
Dockerfile (10 lines changed)

@@ -44,7 +44,7 @@ COPY . .
 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
-&& mold -run cargo build --bin pageserver --bin safekeeper --bin proxy --locked --release \
+&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin safekeeper --bin proxy --locked --release \
 && cachepot -s

 # Build final image
@@ -63,9 +63,10 @@ RUN set -e \
 && useradd -d /data neon \
 && chown -R neon:neon /data

 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
@@ -85,4 +86,3 @@ VOLUME ["/data"]
 USER neon
 EXPOSE 6400
 EXPOSE 9898
-CMD ["/bin/bash"]
|||||||
@@ -71,10 +71,12 @@ RUN apt update && \
|
|||||||
RUN apt update && \
|
RUN apt update && \
|
||||||
apt install -y --no-install-recommends -t testing binutils
|
apt install -y --no-install-recommends -t testing binutils
|
||||||
|
|
||||||
|
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
|
||||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
|
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
|
||||||
tar xvzf v3.1.4.tar.gz && \
|
tar xvzf v3.1.4.tar.gz && \
|
||||||
cd plv8-3.1.4 && \
|
cd plv8-3.1.4 && \
|
||||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||||
|
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||||
rm -rf /plv8-* && \
|
rm -rf /plv8-* && \
|
||||||
@@ -116,8 +118,7 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
|
|||||||
#
|
#
|
||||||
FROM build-deps AS neon-pg-ext-build
|
FROM build-deps AS neon-pg-ext-build
|
||||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
# plv8 still sometimes crashes during the creation
|
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
COPY --from=h3-pg-build /h3/usr /
|
COPY --from=h3-pg-build /h3/usr /
|
||||||
COPY pgxn/ pgxn/
|
COPY pgxn/ pgxn/
|
||||||
|
|||||||
@@ -5,7 +5,7 @@

 ARG TAG=pinned
 # apparently, ARGs don't get replaced in RUN commands in kaniko
-# ARG POSTGIS_VERSION=3.3.0
+# ARG POSTGIS_VERSION=3.3.1
 # ARG PLV8_VERSION=3.1.4
 # ARG PG_VERSION=v15

@@ -13,9 +13,12 @@ ARG TAG=pinned
 # Layer "build-deps"
 #
 FROM debian:bullseye-slim AS build-deps
+RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
+echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
+apt update
 RUN apt update && \
 apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
-libcurl4-openssl-dev libossp-uuid-dev
+libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev

 #
 # Layer "pg-build"
@@ -42,11 +45,11 @@ RUN cd postgres && \
 FROM build-deps AS postgis-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
-apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget
+apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc

-RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
-tar xvzf postgis-3.3.0.tar.gz && \
-cd postgis-3.3.0 && \
+RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
+tar xvzf postgis-3.3.1.tar.gz && \
+cd postgis-3.3.1 && \
 ./autogen.sh && \
 export PATH="/usr/local/pgsql/bin:$PATH" && \
 ./configure && \
@@ -64,32 +67,65 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
 # Build plv8
 #
 FROM build-deps AS plv8-build
-COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
-apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev
+apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5

 # https://github.com/plv8/plv8/issues/475
 # Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
-RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
-echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
-apt update && \
+RUN apt update && \
 apt install -y --no-install-recommends -t testing binutils

+# Sed is used to patch for https://github.com/plv8/plv8/issues/503
 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
 tar xvzf v3.1.4.tar.gz && \
 cd plv8-3.1.4 && \
 export PATH="/usr/local/pgsql/bin:$PATH" && \
+sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
 make -j $(getconf _NPROCESSORS_ONLN) && \
 make -j $(getconf _NPROCESSORS_ONLN) install && \
 rm -rf /plv8-* && \
 echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control

+#
+# Layer "h3-pg-build"
+# Build h3_pg
+#
+FROM build-deps AS h3-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

+# packaged cmake is too old
+RUN apt update && \
+apt install -y --no-install-recommends -t testing cmake

+RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
+tar xvzf h3.tgz && \
+cd h3-4.0.1 && \
+mkdir build && \
+cd build && \
+cmake .. -DCMAKE_BUILD_TYPE=Release && \
+make -j $(getconf _NPROCESSORS_ONLN) && \
+DESTDIR=/h3 make install && \
+cp -R /h3/usr / && \
+rm -rf build

+RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \
+tar xvzf h3-pg.tgz && \
+cd h3-pg-4.0.1 && \
+export PATH="/usr/local/pgsql/bin:$PATH" && \
+make -j $(getconf _NPROCESSORS_ONLN) && \
+make -j $(getconf _NPROCESSORS_ONLN) install && \
+echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control

 #
 # Layer "neon-pg-ext-build"
 # compile neon extensions
 #
 FROM build-deps AS neon-pg-ext-build
 COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=h3-pg-build /h3/usr /
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -137,8 +173,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
 chmod 0750 /var/db/postgres/compute && \
 echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig

-# TODO: Check if we can make the extension setup more modular versus a linear build
-# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc#
 COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
 COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
@@ -6,10 +6,12 @@ edition = "2021"
 [dependencies]
 anyhow = "1.0"
 chrono = "0.4"
-clap = "3.0"
+clap = "4.0"
 env_logger = "0.9"
+futures = "0.3.13"
 hyper = { version = "0.14", features = ["full"] }
 log = { version = "0.4", features = ["std", "serde"] }
+notify = "5.0.0"
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 regex = "1"
 serde = { version = "1.0", features = ["derive"] }
@@ -51,53 +51,19 @@ fn main() -> Result<()> {
 // TODO: re-use `utils::logging` later
 init_logger(DEFAULT_LOG_LEVEL)?;

-// Env variable is set by `cargo`
-let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
-let matches = clap::App::new("compute_ctl")
-.version(version.unwrap_or("unknown"))
-.arg(
-Arg::new("connstr")
-.short('C')
-.long("connstr")
-.value_name("DATABASE_URL")
-.required(true),
-)
-.arg(
-Arg::new("pgdata")
-.short('D')
-.long("pgdata")
-.value_name("DATADIR")
-.required(true),
-)
-.arg(
-Arg::new("pgbin")
-.short('b')
-.long("pgbin")
-.value_name("POSTGRES_PATH"),
-)
-.arg(
-Arg::new("spec")
-.short('s')
-.long("spec")
-.value_name("SPEC_JSON"),
-)
-.arg(
-Arg::new("spec-path")
-.short('S')
-.long("spec-path")
-.value_name("SPEC_PATH"),
-)
-.get_matches();
+let matches = cli().get_matches();

-let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
+let pgdata = matches
+.get_one::<String>("pgdata")
+.expect("PGDATA path is required");
 let connstr = matches
-.value_of("connstr")
+.get_one::<String>("connstr")
 .expect("Postgres connection string is required");
-let spec = matches.value_of("spec");
-let spec_path = matches.value_of("spec-path");
+let spec = matches.get_one::<String>("spec");
+let spec_path = matches.get_one::<String>("spec-path");

 // Try to use just 'postgres' if no path is provided
-let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
+let pgbin = matches.get_one::<String>("pgbin").unwrap();

 let spec: ComputeSpec = match spec {
 // First, try to get cluster spec from the cli argument
@@ -173,3 +139,48 @@ fn main() -> Result<()> {
 }
 }
 }

+fn cli() -> clap::Command {
+// Env variable is set by `cargo`
+let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
+clap::Command::new("compute_ctl")
+.version(version)
+.arg(
+Arg::new("connstr")
+.short('C')
+.long("connstr")
+.value_name("DATABASE_URL")
+.required(true),
+)
+.arg(
+Arg::new("pgdata")
+.short('D')
+.long("pgdata")
+.value_name("DATADIR")
+.required(true),
+)
+.arg(
+Arg::new("pgbin")
+.short('b')
+.long("pgbin")
+.default_value("postgres")
+.value_name("POSTGRES_PATH"),
+)
+.arg(
+Arg::new("spec")
+.short('s')
+.long("spec")
+.value_name("SPEC_JSON"),
+)
+.arg(
+Arg::new("spec-path")
+.short('S')
+.long("spec-path")
+.value_name("SPEC_PATH"),
+)
+}

+#[test]
+fn verify_cli() {
+cli().debug_assert()
+}
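The two hunks above move `compute_ctl` from the clap 3 builder (`App`, `value_of`) to clap 4 (`Command`, typed `get_one`) and pull the parser into a `cli()` function so a test can validate it. A minimal, self-contained sketch of that pattern follows; the binary name, arguments and defaults here are illustrative, not the project's full CLI.

```rust
// Sketch only: clap 4 builder with the parser factored into cli() for testing.
use clap::{Arg, Command};

fn cli() -> Command {
    Command::new("example")
        .arg(Arg::new("connstr").short('C').long("connstr").required(true))
        .arg(Arg::new("pgbin").short('b').long("pgbin").default_value("postgres"))
}

fn main() {
    let matches = cli().get_matches();
    // clap 4 replaces `value_of` with typed lookups; String is the default type.
    let connstr: &String = matches.get_one::<String>("connstr").expect("required");
    let pgbin = matches.get_one::<String>("pgbin").unwrap(); // default keeps this Some
    println!("connstr={connstr} pgbin={pgbin}");
}

#[test]
fn verify_cli() {
    // Catches invalid argument definitions at test time instead of at first run.
    cli().debug_assert()
}
```

Factoring the parser out this way is what makes the `cli().debug_assert()` test possible: clap checks the argument definitions without parsing any real input.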
@@ -178,7 +178,6 @@ impl ComputeNode {
 .args(&["--sync-safekeepers"])
 .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
 .stdout(Stdio::piped())
-.stderr(Stdio::piped())
 .spawn()
 .expect("postgres --sync-safekeepers failed to start");

@@ -191,10 +190,10 @@

 if !sync_output.status.success() {
 anyhow::bail!(
-"postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}, stderr: {}",
+"postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}",
 sync_output.status,
-String::from_utf8(sync_output.stdout).expect("postgres --sync-safekeepers exited, and stdout is not utf-8"),
-String::from_utf8(sync_output.stderr).expect("postgres --sync-safekeepers exited, and stderr is not utf-8"),
+String::from_utf8(sync_output.stdout)
+.expect("postgres --sync-safekeepers exited, and stdout is not utf-8"),
 );
 }

@@ -258,14 +257,7 @@ impl ComputeNode {
 .spawn()
 .expect("cannot start postgres process");

-// Try default Postgres port if it is not provided
-let port = self
-.spec
-.cluster
-.settings
-.find("port")
-.unwrap_or_else(|| "5432".to_string());
-wait_for_postgres(&mut pg, &port, pgdata_path)?;
+wait_for_postgres(&mut pg, pgdata_path)?;

 // If connection fails,
 // it may be the old node with `zenith_admin` superuser.
@@ -1,18 +1,18 @@
 use std::fmt::Write;
+use std::fs;
 use std::fs::File;
 use std::io::{BufRead, BufReader};
-use std::net::{SocketAddr, TcpStream};
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::Child;
-use std::str::FromStr;
-use std::{fs, thread, time};
+use std::time::{Duration, Instant};

 use anyhow::{bail, Result};
+use notify::{RecursiveMode, Watcher};
 use postgres::{Client, Transaction};
 use serde::Deserialize;

-const POSTGRES_WAIT_TIMEOUT: u64 = 60 * 1000; // milliseconds
+const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds

 /// Rust representation of Postgres role info with only those fields
 /// that matter for us.
@@ -168,7 +168,7 @@ impl Database {
 /// it may require a proper quoting too.
 pub fn to_pg_options(&self) -> String {
 let mut params: String = self.options.as_pg_options();
-write!(params, " OWNER {}", &self.owner.quote())
+write!(params, " OWNER {}", &self.owner.pg_quote())
 .expect("String is documented to not to error during write operations");

 params
@@ -179,18 +179,17 @@ impl Database {
 /// intended to be used for DB / role names.
 pub type PgIdent = String;

-/// Generic trait used to provide quoting for strings used in the
-/// Postgres SQL queries. Currently used only to implement quoting
-/// of identifiers, but could be used for literals in the future.
-pub trait PgQuote {
-fn quote(&self) -> String;
+/// Generic trait used to provide quoting / encoding for strings used in the
+/// Postgres SQL queries and DATABASE_URL.
+pub trait Escaping {
+fn pg_quote(&self) -> String;
 }

-impl PgQuote for PgIdent {
+impl Escaping for PgIdent {
 /// This is intended to mimic Postgres quote_ident(), but for simplicity it
-/// always quotes provided string with `""` and escapes every `"`. Not idempotent,
-/// i.e. if string is already escaped it will be escaped again.
-fn quote(&self) -> String {
+/// always quotes provided string with `""` and escapes every `"`.
+/// **Not idempotent**, i.e. if string is already escaped it will be escaped again.
+fn pg_quote(&self) -> String {
 let result = format!("\"{}\"", self.replace('"', "\"\""));
 result
 }
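Read on its own, the renamed `Escaping::pg_quote` above is just the doubled-double-quote scheme of Postgres `quote_ident()`. A standalone sketch, mirroring the code and the test shown in this diff:

```rust
// Standalone illustration of the quoting scheme above: wrap the identifier in
// double quotes and double any embedded quotes. Not idempotent by design.
pub type PgIdent = String;

pub trait Escaping {
    fn pg_quote(&self) -> String;
}

impl Escaping for PgIdent {
    fn pg_quote(&self) -> String {
        format!("\"{}\"", self.replace('"', "\"\""))
    }
}

#[test]
fn ident_pg_quote() {
    let ident: PgIdent = PgIdent::from("\"name\";\\n select 1;");
    assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
}
```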
@@ -230,52 +229,112 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
 Ok(postgres_dbs)
 }

-/// Wait for Postgres to become ready to accept connections:
-/// - state should be `ready` in the `pgdata/postmaster.pid`
-/// - and we should be able to connect to 127.0.0.1:5432
-pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> {
+/// Wait for Postgres to become ready to accept connections. It's ready to
+/// accept connections when the state-field in `pgdata/postmaster.pid` says
+/// 'ready'.
+pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
 let pid_path = pgdata.join("postmaster.pid");
-let mut slept: u64 = 0; // ms
-let pause = time::Duration::from_millis(100);

-let timeout = time::Duration::from_millis(10);
-let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap();
-loop {
-// Sleep POSTGRES_WAIT_TIMEOUT at max (a bit longer actually if consider a TCP timeout,
-// but postgres starts listening almost immediately, even if it is not really
-// ready to accept connections).
-if slept >= POSTGRES_WAIT_TIMEOUT {
-bail!("timed out while waiting for Postgres to start");
+// PostgreSQL writes line "ready" to the postmaster.pid file, when it has
+// completed initialization and is ready to accept connections. We want to
+// react quickly and perform the rest of our initialization as soon as
+// PostgreSQL starts accepting connections. Use 'notify' to be notified
+// whenever the PID file is changed, and whenever it changes, read it to
+// check if it's now "ready".
+//
+// You cannot actually watch a file before it exists, so we first watch the
+// data directory, and once the postmaster.pid file appears, we switch to
+// watch the file instead. We also wake up every 100 ms to poll, just in
+// case we miss some events for some reason. Not strictly necessary, but
+// better safe than sorry.
+let (tx, rx) = std::sync::mpsc::channel();
+let (mut watcher, rx): (Box<dyn Watcher>, _) = match notify::recommended_watcher(move |res| {
+let _ = tx.send(res);
+}) {
+Ok(watcher) => (Box::new(watcher), rx),
+Err(e) => {
+match e.kind {
+notify::ErrorKind::Io(os) if os.raw_os_error() == Some(38) => {
+// docker on m1 macs does not support recommended_watcher
+// but return "Function not implemented (os error 38)"
+// see https://github.com/notify-rs/notify/issues/423
+let (tx, rx) = std::sync::mpsc::channel();

+// let's poll it faster than what we check the results for (100ms)
+let config =
+notify::Config::default().with_poll_interval(Duration::from_millis(50));
+let watcher = notify::PollWatcher::new(
+move |res| {
+let _ = tx.send(res);
+},
+config,
+)?;

+(Box::new(watcher), rx)
+}
+_ => return Err(e.into()),
+}
 }
+};

+watcher.watch(pgdata, RecursiveMode::NonRecursive)?;

+let started_at = Instant::now();
+let mut postmaster_pid_seen = false;
+loop {
 if let Ok(Some(status)) = pg.try_wait() {
 // Postgres exited, that is not what we expected, bail out earlier.
 let code = status.code().unwrap_or(-1);
 bail!("Postgres exited unexpectedly with code {}", code);
 }

+let res = rx.recv_timeout(Duration::from_millis(100));
+log::debug!("woken up by notify: {res:?}");
+// If there are multiple events in the channel already, we only need to be
+// check once. Swallow the extra events before we go ahead to check the
+// pid file.
+while let Ok(res) = rx.try_recv() {
+log::debug!("swallowing extra event: {res:?}");
+}

 // Check that we can open pid file first.
 if let Ok(file) = File::open(&pid_path) {
+if !postmaster_pid_seen {
+log::debug!("postmaster.pid appeared");
+watcher
+.unwatch(pgdata)
+.expect("Failed to remove pgdata dir watch");
+watcher
+.watch(&pid_path, RecursiveMode::NonRecursive)
+.expect("Failed to add postmaster.pid file watch");
+postmaster_pid_seen = true;
+}

 let file = BufReader::new(file);
 let last_line = file.lines().last();

 // Pid file could be there and we could read it, but it could be empty, for example.
 if let Some(Ok(line)) = last_line {
 let status = line.trim();
-let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
+log::debug!("last line of postmaster.pid: {status:?}");

 // Now Postgres is ready to accept connections
-if status == "ready" && can_connect {
+if status == "ready" {
 break;
 }
 }
 }

-thread::sleep(pause);
-slept += 100;
+// Give up after POSTGRES_WAIT_TIMEOUT.
+let duration = started_at.elapsed();
+if duration >= POSTGRES_WAIT_TIMEOUT {
+bail!("timed out while waiting for Postgres to start");
+}
 }

+log::info!("PostgreSQL is now running, continuing to configure it");

 Ok(())
 }
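The rewritten `wait_for_postgres` above replaces fixed-interval sleeping with a `notify` watcher plus a 100 ms receive timeout. Below is a trimmed-down sketch of that wait loop under simplified assumptions (one watched directory, no PollWatcher fallback, no switch from watching the directory to watching the pid file); the helper name is made up for illustration.

```rust
// Sketch: block until a "ready" marker appears in <dir>/postmaster.pid,
// waking up on filesystem events or every 100 ms, whichever comes first.
use std::{fs, path::Path, sync::mpsc, time::{Duration, Instant}};
use notify::{RecursiveMode, Watcher};

fn wait_for_ready(dir: &Path, timeout: Duration) -> anyhow::Result<()> {
    let (tx, rx) = mpsc::channel();
    let mut watcher = notify::recommended_watcher(move |res| {
        let _ = tx.send(res); // wake the waiter on any filesystem event
    })?;
    watcher.watch(dir, RecursiveMode::NonRecursive)?;

    let started = Instant::now();
    loop {
        // Either an event arrived or the 100 ms poll interval elapsed.
        let _ = rx.recv_timeout(Duration::from_millis(100));
        if let Ok(contents) = fs::read_to_string(dir.join("postmaster.pid")) {
            if contents.lines().last().map(str::trim) == Some("ready") {
                return Ok(());
            }
        }
        if started.elapsed() >= timeout {
            anyhow::bail!("timed out waiting for readiness marker");
        }
    }
}
```

The periodic wake-up is what makes missed events harmless: even if an event is lost, the file is re-checked at most 100 ms later.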
@@ -1,7 +1,9 @@
 use std::path::Path;
+use std::str::FromStr;

 use anyhow::Result;
 use log::{info, log_enabled, warn, Level};
+use postgres::config::Config;
 use postgres::{Client, NoTls};
 use serde::Deserialize;

@@ -115,8 +117,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 if existing_roles.iter().any(|r| r.name == op.name) {
 let query: String = format!(
 "ALTER ROLE {} RENAME TO {}",
-op.name.quote(),
-new_name.quote()
+op.name.pg_quote(),
+new_name.pg_quote()
 );

 warn!("renaming role '{}' to '{}'", op.name, new_name);
@@ -162,7 +164,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 }

 if update_role {
-let mut query: String = format!("ALTER ROLE {} ", name.quote());
+let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
 info_print!(" -> update");

 query.push_str(&role.to_pg_options());
@@ -170,7 +172,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 }
 } else {
 info!("role name: '{}'", &name);
-let mut query: String = format!("CREATE ROLE {} ", name.quote());
+let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
 info!("role create query: '{}'", &query);
 info_print!(" -> create");

@@ -179,7 +181,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {

 let grant_query = format!(
 "GRANT pg_read_all_data, pg_write_all_data TO {}",
-name.quote()
+name.pg_quote()
 );
 xact.execute(grant_query.as_str(), &[])?;
 info!("role grant query: '{}'", &grant_query);
@@ -215,7 +217,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
 // We do not check either role exists or not,
 // Postgres will take care of it for us
 if op.action == "delete_role" {
-let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
+let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.pg_quote());

 warn!("deleting role '{}'", &op.name);
 xact.execute(query.as_str(), &[])?;
@@ -230,17 +232,16 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
 fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
 for db in &node.spec.cluster.databases {
 if db.owner != *role_name {
-let mut connstr = node.connstr.clone();
-// database name is always the last and the only component of the path
-connstr.set_path(&db.name);
+let mut conf = Config::from_str(node.connstr.as_str())?;
+conf.dbname(&db.name);

-let mut client = Client::connect(connstr.as_str(), NoTls)?;
+let mut client = conf.connect(NoTls)?;

 // This will reassign all dependent objects to the db owner
 let reassign_query = format!(
 "REASSIGN OWNED BY {} TO {}",
-role_name.quote(),
-db.owner.quote()
+role_name.pg_quote(),
+db.owner.pg_quote()
 );
 info!(
 "reassigning objects owned by '{}' in db '{}' to '{}'",
@@ -249,7 +250,7 @@ fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()>
 client.simple_query(&reassign_query)?;

 // This now will only drop privileges of the role
-let drop_query = format!("DROP OWNED BY {}", role_name.quote());
+let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
 client.simple_query(&drop_query)?;
 }
 }
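The `reassign_owned_objects` hunk above swaps URL-path editing for `postgres::config::Config`, overriding only the database name on an otherwise unchanged connection. A small sketch of that connection pattern; the connection string and database name below are made up for illustration.

```rust
// Sketch: parse the admin connection string once, then switch only the dbname
// instead of rewriting the URL path by hand.
use std::str::FromStr;
use postgres::{config::Config, Client, NoTls};

fn connect_to_db(admin_connstr: &str, dbname: &str) -> Result<Client, postgres::Error> {
    let mut conf = Config::from_str(admin_connstr)?;
    conf.dbname(dbname); // host, port, user and password are reused as-is
    conf.connect(NoTls)
}

fn main() -> Result<(), postgres::Error> {
    // Hypothetical local connection string, only for illustration.
    let mut client = connect_to_db("postgresql://cloud_admin@127.0.0.1:5432/postgres", "neondb")?;
    client.simple_query("SELECT 1")?;
    Ok(())
}
```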
@@ -279,7 +280,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 // We do not check either DB exists or not,
 // Postgres will take care of it for us
 "delete_db" => {
-let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.quote());
+let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.pg_quote());

 warn!("deleting database '{}'", &op.name);
 client.execute(query.as_str(), &[])?;
@@ -291,8 +292,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 if existing_dbs.iter().any(|r| r.name == op.name) {
 let query: String = format!(
 "ALTER DATABASE {} RENAME TO {}",
-op.name.quote(),
-new_name.quote()
+op.name.pg_quote(),
+new_name.pg_quote()
 );

 warn!("renaming database '{}' to '{}'", op.name, new_name);
@@ -320,7 +321,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 // XXX: db owner name is returned as quoted string from Postgres,
 // when quoting is needed.
 let new_owner = if r.owner.starts_with('"') {
-db.owner.quote()
+db.owner.pg_quote()
 } else {
 db.owner.clone()
 };
@@ -328,15 +329,15 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 if new_owner != r.owner {
 let query: String = format!(
 "ALTER DATABASE {} OWNER TO {}",
-name.quote(),
-db.owner.quote()
+name.pg_quote(),
+db.owner.pg_quote()
 );
 info_print!(" -> update");

 client.execute(query.as_str(), &[])?;
 }
 } else {
-let mut query: String = format!("CREATE DATABASE {} ", name.quote());
+let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
 info_print!(" -> create");

 query.push_str(&db.to_pg_options());
@@ -366,7 +367,7 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
 .cluster
 .roles
 .iter()
-.map(|r| r.name.quote())
+.map(|r| r.name.pg_quote())
 .collect::<Vec<_>>();

 for db in &spec.cluster.databases {
@@ -374,7 +375,7 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {

 let query: String = format!(
 "GRANT CREATE ON DATABASE {} TO {}",
-dbname.quote(),
+dbname.pg_quote(),
 roles.join(", ")
 );
 info!("grant query {}", &query);
@@ -385,12 +386,11 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
 // Do some per-database access adjustments. We'd better do this at db creation time,
 // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
 // atomically.
-let mut db_connstr = node.connstr.clone();
 for db in &node.spec.cluster.databases {
-// database name is always the last and the only component of the path
-db_connstr.set_path(&db.name);
+let mut conf = Config::from_str(node.connstr.as_str())?;
+conf.dbname(&db.name);

-let mut db_client = Client::connect(db_connstr.as_str(), NoTls)?;
+let mut db_client = conf.connect(NoTls)?;

 // This will only change ownership on the schema itself, not the objects
 // inside it. Without it owner of the `public` schema will be `cloud_admin`
@@ -419,9 +419,15 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
 END IF;\n\
 END\n\
 $$;",
-db.owner.quote()
+db.owner.pg_quote()
 );
 db_client.simple_query(&alter_query)?;

+// Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
+// This is needed since postgres 15, where this privilege is removed by default.
+let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string();
+info!("grant query for db {} : {}", &db.name, &grant_query);
+db_client.simple_query(&grant_query)?;
 }

 Ok(())
@@ -33,9 +33,9 @@ mod pg_helpers_tests {
 }

 #[test]
-fn quote_ident() {
+fn ident_pg_quote() {
 let ident: PgIdent = PgIdent::from("\"name\";\\n select 1;");

-assert_eq!(ident.quote(), "\"\"\"name\"\";\\n select 1;\"");
+assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
 }
 }
@@ -4,22 +4,24 @@ version = "0.1.0"
 edition = "2021"

 [dependencies]
-clap = "3.0"
-comfy-table = "5.0.1"
+clap = "4.0"
+comfy-table = "6.1"
 git-version = "0.3.5"
 tar = "0.4.38"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 serde = { version = "1.0", features = ["derive"] }
-serde_with = "1.12.0"
+serde_with = "2.0"
 toml = "0.5"
 once_cell = "1.13.0"
 regex = "1"
 anyhow = "1.0"
 thiserror = "1"
-nix = "0.23"
+nix = "0.25"
 reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }

-pageserver = { path = "../pageserver" }
-safekeeper = { path = "../safekeeper" }
+# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
+# instead, so that recompile times are better.
+pageserver_api = { path = "../libs/pageserver_api" }
+safekeeper_api = { path = "../libs/safekeeper_api" }
 utils = { path = "../libs/utils" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
@@ -6,18 +6,18 @@
 //! rely on `neon_local` to set up the environment for each test.
 //!
 use anyhow::{anyhow, bail, Context, Result};
-use clap::{App, AppSettings, Arg, ArgMatches};
+use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
 use control_plane::compute::ComputeControlPlane;
 use control_plane::local_env::{EtcdBroker, LocalEnv};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage::PageServerNode;
 use control_plane::{etcd, local_env};
-use pageserver::config::defaults::{
+use pageserver_api::models::TimelineInfo;
+use pageserver_api::{
 DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
 DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
 };
-use pageserver::http::models::TimelineInfo;
-use safekeeper::defaults::{
+use safekeeper_api::{
 DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
 DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
 };
@@ -85,212 +85,7 @@ struct TimelineTreeEl {
 // * Providing CLI api to the pageserver
 // * TODO: export/import to/from usual postgres
 fn main() -> Result<()> {
-let branch_name_arg = Arg::new("branch-name")
-.long("branch-name")
-.takes_value(true)
-.help("Name of the branch to be created or used as an alias for other services")
-.required(false);

-let pg_node_arg = Arg::new("node").help("Postgres node name").required(false);

-let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);

-let tenant_id_arg = Arg::new("tenant-id")
-.long("tenant-id")
-.help("Tenant id. Represented as a hexadecimal string 32 symbols length")
-.takes_value(true)
-.required(false);

-let timeline_id_arg = Arg::new("timeline-id")
-.long("timeline-id")
-.help("Timeline id. Represented as a hexadecimal string 32 symbols length")
-.takes_value(true)
-.required(false);

-let pg_version_arg = Arg::new("pg-version")
-.long("pg-version")
-.help("Postgres version to use for the initial tenant")
-.required(false)
-.takes_value(true)
-.default_value(DEFAULT_PG_VERSION);

-let port_arg = Arg::new("port")
-.long("port")
-.required(false)
-.value_name("port");

-let stop_mode_arg = Arg::new("stop-mode")
-.short('m')
-.takes_value(true)
-.possible_values(&["fast", "immediate"])
-.help("If 'immediate', don't flush repository data at shutdown")
-.required(false)
-.value_name("stop-mode");

-let pageserver_config_args = Arg::new("pageserver-config-override")
-.long("pageserver-config-override")
-.takes_value(true)
-.number_of_values(1)
-.multiple_occurrences(true)
-.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
-.required(false);

-let lsn_arg = Arg::new("lsn")
-.long("lsn")
-.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
-.takes_value(true)
-.required(false);

-let matches = App::new("Neon CLI")
-.setting(AppSettings::ArgRequiredElseHelp)
-.version(GIT_VERSION)
-.subcommand(
-App::new("init")
-.about("Initialize a new Neon repository")
-.arg(pageserver_config_args.clone())
-.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
-.arg(
-Arg::new("config")
-.long("config")
-.required(false)
-.value_name("config"),
-)
-.arg(pg_version_arg.clone())
-)
-.subcommand(
-App::new("timeline")
-.about("Manage timelines")
-.subcommand(App::new("list")
-.about("List all timelines, available to this pageserver")
-.arg(tenant_id_arg.clone()))
-.subcommand(App::new("branch")
-.about("Create a new timeline, using another timeline as a base, copying its data")
-.arg(tenant_id_arg.clone())
-.arg(branch_name_arg.clone())
-.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name").takes_value(true)
-.help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
-.arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn").takes_value(true)
-.help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false)))
-.subcommand(App::new("create")
-.about("Create a new blank timeline")
-.arg(tenant_id_arg.clone())
-.arg(branch_name_arg.clone())
-.arg(pg_version_arg.clone())
-)
-.subcommand(App::new("import")
-.about("Import timeline from basebackup directory")
-.arg(tenant_id_arg.clone())
-.arg(timeline_id_arg.clone())
-.arg(Arg::new("node-name").long("node-name").takes_value(true)
-.help("Name to assign to the imported timeline"))
-.arg(Arg::new("base-tarfile").long("base-tarfile").takes_value(true)
-.help("Basebackup tarfile to import"))
-.arg(Arg::new("base-lsn").long("base-lsn").takes_value(true)
-.help("Lsn the basebackup starts at"))
-.arg(Arg::new("wal-tarfile").long("wal-tarfile").takes_value(true)
-.help("Wal to add after base"))
-.arg(Arg::new("end-lsn").long("end-lsn").takes_value(true)
-.help("Lsn the basebackup ends at"))
-.arg(pg_version_arg.clone())
-)
-).subcommand(
-App::new("tenant")
-.setting(AppSettings::ArgRequiredElseHelp)
-.about("Manage tenants")
-.subcommand(App::new("list"))
-.subcommand(App::new("create")
-.arg(tenant_id_arg.clone())
-.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
-.arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false))
-.arg(pg_version_arg.clone())
-)
-.subcommand(App::new("config")
-.arg(tenant_id_arg.clone())
-.arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false))
-)
-)
-.subcommand(
-App::new("pageserver")
-.setting(AppSettings::ArgRequiredElseHelp)
-.about("Manage pageserver")
-.subcommand(App::new("status"))
-.subcommand(App::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
-.subcommand(App::new("stop").about("Stop local pageserver")
-.arg(stop_mode_arg.clone()))
-.subcommand(App::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
-)
-.subcommand(
-App::new("safekeeper")
-.setting(AppSettings::ArgRequiredElseHelp)
-.about("Manage safekeepers")
-.subcommand(App::new("start")
-.about("Start local safekeeper")
-.arg(safekeeper_id_arg.clone())
-)
-.subcommand(App::new("stop")
-.about("Stop local safekeeper")
-.arg(safekeeper_id_arg.clone())
-.arg(stop_mode_arg.clone())
-)
-.subcommand(App::new("restart")
-.about("Restart local safekeeper")
-.arg(safekeeper_id_arg.clone())
-.arg(stop_mode_arg.clone())
-)
-)
-.subcommand(
-App::new("pg")
-.setting(AppSettings::ArgRequiredElseHelp)
-.about("Manage postgres instances")
-.subcommand(App::new("list").arg(tenant_id_arg.clone()))
-.subcommand(App::new("create")
-.about("Create a postgres compute node")
-.arg(pg_node_arg.clone())
-.arg(branch_name_arg.clone())
-.arg(tenant_id_arg.clone())
-.arg(lsn_arg.clone())
-.arg(port_arg.clone())
-.arg(
-Arg::new("config-only")
-.help("Don't do basebackup, create compute node with only config files")
-.long("config-only")
-.required(false))
-.arg(pg_version_arg.clone())
-)
-.subcommand(App::new("start")
-.about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
-.arg(pg_node_arg.clone())
-.arg(tenant_id_arg.clone())
-.arg(branch_name_arg.clone())
-.arg(timeline_id_arg.clone())
-.arg(lsn_arg.clone())
-.arg(port_arg.clone())
-.arg(pg_version_arg.clone())
-)
-.subcommand(
-App::new("stop")
-.arg(pg_node_arg.clone())
-.arg(tenant_id_arg.clone())
-.arg(
-Arg::new("destroy")
-.help("Also delete data directory (now optional, should be default in future)")
-.long("destroy")
-.required(false)
-)
-)

-)
-.subcommand(
-App::new("start")
-.about("Start page server and safekeepers")
-.arg(pageserver_config_args)
-)
-.subcommand(
-App::new("stop")
-.about("Stop page server and safekeepers")
-.arg(stop_mode_arg.clone())
-)
-.get_matches();
+let matches = cli().get_matches();

 let (sub_name, sub_args) = match matches.subcommand() {
 Some(subcommand_data) => subcommand_data,
@@ -358,9 +153,7 @@ fn print_timelines_tree(

 // Memorize all direct children of each timeline.
 for timeline in timelines.iter() {
-if let Some(ancestor_timeline_id) =
-timeline.local.as_ref().and_then(|l| l.ancestor_timeline_id)
-{
+if let Some(ancestor_timeline_id) = timeline.ancestor_timeline_id {
 timelines_hash
 .get_mut(&ancestor_timeline_id)
 .context("missing timeline info in the HashMap")?
@@ -371,13 +164,7 @@ fn print_timelines_tree(

 for timeline in timelines_hash.values() {
 // Start with root local timelines (no ancestors) first.
-if timeline
-.info
-.local
-.as_ref()
-.and_then(|l| l.ancestor_timeline_id)
-.is_none()
-{
+if timeline.info.ancestor_timeline_id.is_none() {
 print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?;
 }
 }
@@ -394,17 +181,8 @@ fn print_timeline(
 timeline: &TimelineTreeEl,
 timelines: &HashMap<TimelineId, TimelineTreeEl>,
 ) -> Result<()> {
-let local_remote = match (timeline.info.local.as_ref(), timeline.info.remote.as_ref()) {
-(None, None) => unreachable!("in this case no info for a timeline is found"),
-(None, Some(_)) => "(R)",
-(Some(_), None) => "(L)",
-(Some(_), Some(_)) => "(L+R)",
-};
-// Draw main padding
-print!("{} ", local_remote);

 if nesting_level > 0 {
-let ancestor_lsn = match timeline.info.local.as_ref().and_then(|i| i.ancestor_lsn) {
+let ancestor_lsn = match timeline.info.ancestor_lsn {
 Some(lsn) => lsn.to_string(),
 None => "Unknown Lsn".to_string(),
 };
@@ -492,16 +270,16 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R

 fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
 sub_match
-.value_of("tenant-id")
-.map(TenantId::from_str)
+.get_one::<String>("tenant-id")
+.map(|tenant_id| TenantId::from_str(tenant_id))
 .transpose()
 .context("Failed to parse tenant id from the argument string")
 }

 fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId>> {
 sub_match
-.value_of("timeline-id")
-.map(TimelineId::from_str)
+.get_one::<String>("timeline-id")
+.map(|timeline_id| TimelineId::from_str(timeline_id))
 .transpose()
 .context("Failed to parse timeline id from the argument string")
 }
@@ -510,19 +288,22 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
 let initial_timeline_id_arg = parse_timeline_id(init_match)?;

 // Create config file
-let toml_file: String = if let Some(config_path) = init_match.value_of("config") {
+let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
 // load and parse the file
-std::fs::read_to_string(std::path::Path::new(config_path))
-.with_context(|| format!("Could not read configuration file '{config_path}'"))?
+std::fs::read_to_string(config_path).with_context(|| {
+format!(
+"Could not read configuration file '{}'",
+config_path.display()
+)
+})?
 } else {
 // Built-in default config
 default_conf(&EtcdBroker::locate_etcd()?)
 };

 let pg_version = init_match
-.value_of("pg-version")
-.unwrap()
-.parse::<u32>()
+.get_one::<u32>("pg-version")
+.copied()
 .context("Failed to parse postgres version from the argument string")?;

 let mut env =
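The `handle_init` hunk above reads `--pg-version` with `get_one::<u32>(...).copied()` and `--config` with `get_one::<PathBuf>(...)`, which presumes the arguments were declared with typed value parsers (the new `cli()` for `neon_local` is not shown in full here). A hedged sketch of that clap 4 pattern, with illustrative argument names and defaults:

```rust
// Sketch only: typed arguments let clap validate/convert the value, so the
// caller no longer parses strings by hand.
use clap::{value_parser, Arg, Command};

fn main() {
    let matches = Command::new("example")
        .arg(
            Arg::new("pg-version")
                .long("pg-version")
                .value_parser(value_parser!(u32))
                .default_value("14"),
        )
        .arg(
            Arg::new("config")
                .long("config")
                .value_parser(value_parser!(std::path::PathBuf)),
        )
        .get_matches();

    // `get_one` returns the already-parsed type; `copied` turns Option<&u32> into Option<u32>.
    let pg_version: u32 = matches.get_one::<u32>("pg-version").copied().unwrap();
    let config = matches.get_one::<std::path::PathBuf>("config");
    println!("pg_version={pg_version}, config={config:?}");
}
```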
@@ -558,9 +339,10 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {

 fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
 init_match
-.values_of("pageserver-config-override")
+.get_many::<String>("pageserver-config-override")
 .into_iter()
 .flatten()
+.map(|s| s.as_str())
 .collect()
 }

@@ -575,7 +357,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
 Some(("create", create_match)) => {
 let initial_tenant_id = parse_tenant_id(create_match)?;
 let tenant_conf: HashMap<_, _> = create_match
-.values_of("config")
+.get_many::<String>("config")
 .map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
 .unwrap_or_default();
 let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?;
@@ -584,9 +366,8 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
 // Create an initial timeline for the new tenant
 let new_timeline_id = parse_timeline_id(create_match)?;
 let pg_version = create_match
-.value_of("pg-version")
-.unwrap()
-.parse::<u32>()
+.get_one::<u32>("pg-version")
+.copied()
 .context("Failed to parse postgres version from the argument string")?;

 let timeline_info = pageserver.timeline_create(
@@ -597,10 +378,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
 Some(pg_version),
 )?;
 let new_timeline_id = timeline_info.timeline_id;
-let last_record_lsn = timeline_info
-.local
-.context(format!("Failed to get last record LSN: no local timeline info for timeline {new_timeline_id}"))?
-.last_record_lsn;
+let last_record_lsn = timeline_info.last_record_lsn;

 env.register_branch_mapping(
 DEFAULT_BRANCH_NAME.to_string(),
@@ -615,7 +393,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
 Some(("config", create_match)) => {
 let tenant_id = get_tenant_id(create_match, env)?;
 let tenant_conf: HashMap<_, _> = create_match
-.values_of("config")
+.get_many::<String>("config")
 .map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
 .unwrap_or_default();

@@ -642,23 +420,19 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
 Some(("create", create_match)) => {
 let tenant_id = get_tenant_id(create_match, env)?;
 let new_branch_name = create_match
-.value_of("branch-name")
+.get_one::<String>("branch-name")
 .ok_or_else(|| anyhow!("No branch name provided"))?;

 let pg_version = create_match
-.value_of("pg-version")
-.unwrap()
-.parse::<u32>()
+.get_one::<u32>("pg-version")
+.copied()
 .context("Failed to parse postgres version from the argument string")?;

 let timeline_info =
 pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
 let new_timeline_id = timeline_info.timeline_id;

-let last_record_lsn = timeline_info
-.local
-.expect("no local timeline info")
-.last_record_lsn;
+let last_record_lsn = timeline_info.last_record_lsn;
 env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;

 println!(
@@ -670,35 +444,32 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
 let tenant_id = get_tenant_id(import_match, env)?;
 let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided");
 let name = import_match
-.value_of("node-name")
+.get_one::<String>("node-name")
 .ok_or_else(|| anyhow!("No node name provided"))?;

 // Parse base inputs
 let base_tarfile = import_match
-.value_of("base-tarfile")
-.map(|s| PathBuf::from_str(s).unwrap())
-.ok_or_else(|| anyhow!("No base-tarfile provided"))?;
+.get_one::<PathBuf>("base-tarfile")
+.ok_or_else(|| anyhow!("No base-tarfile provided"))?
+.to_owned();
 let base_lsn = Lsn::from_str(
 import_match
-.value_of("base-lsn")
+.get_one::<String>("base-lsn")
 .ok_or_else(|| anyhow!("No base-lsn provided"))?,
 )?;
 let base = (base_lsn, base_tarfile);

 // Parse pg_wal inputs
-let wal_tarfile = import_match
-.value_of("wal-tarfile")
-.map(|s| PathBuf::from_str(s).unwrap());
+let wal_tarfile = import_match.get_one::<PathBuf>("wal-tarfile").cloned();
 let end_lsn = import_match
|
let end_lsn = import_match
|
||||||
.value_of("end-lsn")
|
.get_one::<String>("end-lsn")
|
||||||
.map(|s| Lsn::from_str(s).unwrap());
|
.map(|s| Lsn::from_str(s).unwrap());
|
||||||
// TODO validate both or none are provided
|
// TODO validate both or none are provided
|
||||||
let pg_wal = end_lsn.zip(wal_tarfile);
|
let pg_wal = end_lsn.zip(wal_tarfile);
|
||||||
|
|
||||||
let pg_version = import_match
|
let pg_version = import_match
|
||||||
.value_of("pg-version")
|
.get_one::<u32>("pg-version")
|
||||||
.unwrap()
|
.copied()
|
||||||
.parse::<u32>()
|
|
||||||
.context("Failed to parse postgres version from the argument string")?;
|
.context("Failed to parse postgres version from the argument string")?;
|
||||||
|
|
||||||
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
||||||
@@ -713,10 +484,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
|||||||
Some(("branch", branch_match)) => {
|
Some(("branch", branch_match)) => {
|
||||||
let tenant_id = get_tenant_id(branch_match, env)?;
|
let tenant_id = get_tenant_id(branch_match, env)?;
|
||||||
let new_branch_name = branch_match
|
let new_branch_name = branch_match
|
||||||
.value_of("branch-name")
|
.get_one::<String>("branch-name")
|
||||||
.ok_or_else(|| anyhow!("No branch name provided"))?;
|
.ok_or_else(|| anyhow!("No branch name provided"))?;
|
||||||
let ancestor_branch_name = branch_match
|
let ancestor_branch_name = branch_match
|
||||||
.value_of("ancestor-branch-name")
|
.get_one::<String>("ancestor-branch-name")
|
||||||
|
.map(|s| s.as_str())
|
||||||
.unwrap_or(DEFAULT_BRANCH_NAME);
|
.unwrap_or(DEFAULT_BRANCH_NAME);
|
||||||
let ancestor_timeline_id = env
|
let ancestor_timeline_id = env
|
||||||
.get_branch_timeline_id(ancestor_branch_name, tenant_id)
|
.get_branch_timeline_id(ancestor_branch_name, tenant_id)
|
||||||
@@ -725,8 +497,8 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
|||||||
})?;
|
})?;
|
||||||
|
|
||||||
let start_lsn = branch_match
|
let start_lsn = branch_match
|
||||||
.value_of("ancestor-start-lsn")
|
.get_one::<String>("ancestor-start-lsn")
|
||||||
.map(Lsn::from_str)
|
.map(|lsn_str| Lsn::from_str(lsn_str))
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("Failed to parse ancestor start Lsn from the request")?;
|
.context("Failed to parse ancestor start Lsn from the request")?;
|
||||||
let timeline_info = pageserver.timeline_create(
|
let timeline_info = pageserver.timeline_create(
|
||||||
@@ -738,10 +510,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
|||||||
)?;
|
)?;
|
||||||
let new_timeline_id = timeline_info.timeline_id;
|
let new_timeline_id = timeline_info.timeline_id;
|
||||||
|
|
||||||
let last_record_lsn = timeline_info
|
let last_record_lsn = timeline_info.last_record_lsn;
|
||||||
.local
|
|
||||||
.expect("no local timeline info")
|
|
||||||
.last_record_lsn;
|
|
||||||
|
|
||||||
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
|
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
|
||||||
|
|
||||||
@@ -801,7 +570,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
|||||||
// Use the LSN at the end of the timeline.
|
// Use the LSN at the end of the timeline.
|
||||||
timeline_infos
|
timeline_infos
|
||||||
.get(&node.timeline_id)
|
.get(&node.timeline_id)
|
||||||
.and_then(|bi| bi.local.as_ref().map(|l| l.last_record_lsn.to_string()))
|
.map(|bi| bi.last_record_lsn.to_string())
|
||||||
.unwrap_or_else(|| "?".to_string())
|
.unwrap_or_else(|| "?".to_string())
|
||||||
}
|
}
|
||||||
Some(lsn) => {
|
Some(lsn) => {
|
||||||
@@ -830,45 +599,39 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
|||||||
}
|
}
|
||||||
"create" => {
|
"create" => {
|
||||||
let branch_name = sub_args
|
let branch_name = sub_args
|
||||||
.value_of("branch-name")
|
.get_one::<String>("branch-name")
|
||||||
|
.map(|s| s.as_str())
|
||||||
.unwrap_or(DEFAULT_BRANCH_NAME);
|
.unwrap_or(DEFAULT_BRANCH_NAME);
|
||||||
let node_name = sub_args
|
let node_name = sub_args
|
||||||
.value_of("node")
|
.get_one::<String>("node")
|
||||||
.map(ToString::to_string)
|
.map(|node_name| node_name.to_string())
|
||||||
.unwrap_or_else(|| format!("{}_node", branch_name));
|
.unwrap_or_else(|| format!("{branch_name}_node"));
|
||||||
|
|
||||||
let lsn = sub_args
|
let lsn = sub_args
|
||||||
.value_of("lsn")
|
.get_one::<String>("lsn")
|
||||||
.map(Lsn::from_str)
|
.map(|lsn_str| Lsn::from_str(lsn_str))
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("Failed to parse Lsn from the request")?;
|
.context("Failed to parse Lsn from the request")?;
|
||||||
let timeline_id = env
|
let timeline_id = env
|
||||||
.get_branch_timeline_id(branch_name, tenant_id)
|
.get_branch_timeline_id(branch_name, tenant_id)
|
||||||
.ok_or_else(|| anyhow!("Found no timeline id for branch name '{}'", branch_name))?;
|
.ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?;
|
||||||
|
|
||||||
let port: Option<u16> = match sub_args.value_of("port") {
|
let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
|
||||||
Some(p) => Some(p.parse()?),
|
|
||||||
None => None,
|
|
||||||
};
|
|
||||||
|
|
||||||
let pg_version = sub_args
|
let pg_version = sub_args
|
||||||
.value_of("pg-version")
|
.get_one::<u32>("pg-version")
|
||||||
.unwrap()
|
.copied()
|
||||||
.parse::<u32>()
|
|
||||||
.context("Failed to parse postgres version from the argument string")?;
|
.context("Failed to parse postgres version from the argument string")?;
|
||||||
|
|
||||||
cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?;
|
cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?;
|
||||||
}
|
}
|
||||||
"start" => {
|
"start" => {
|
||||||
let port: Option<u16> = match sub_args.value_of("port") {
|
let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
|
||||||
Some(p) => Some(p.parse()?),
|
|
||||||
None => None,
|
|
||||||
};
|
|
||||||
let node_name = sub_args
|
let node_name = sub_args
|
||||||
.value_of("node")
|
.get_one::<String>("node")
|
||||||
.ok_or_else(|| anyhow!("No node name was provided to start"))?;
|
.ok_or_else(|| anyhow!("No node name was provided to start"))?;
|
||||||
|
|
||||||
let node = cplane.nodes.get(&(tenant_id, node_name.to_owned()));
|
let node = cplane.nodes.get(&(tenant_id, node_name.to_string()));
|
||||||
|
|
||||||
let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) {
|
let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) {
|
||||||
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
|
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
|
||||||
@@ -879,36 +642,33 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
if let Some(node) = node {
|
if let Some(node) = node {
|
||||||
println!("Starting existing postgres {}...", node_name);
|
println!("Starting existing postgres {node_name}...");
|
||||||
node.start(&auth_token)?;
|
node.start(&auth_token)?;
|
||||||
} else {
|
} else {
|
||||||
let branch_name = sub_args
|
let branch_name = sub_args
|
||||||
.value_of("branch-name")
|
.get_one::<String>("branch-name")
|
||||||
|
.map(|s| s.as_str())
|
||||||
.unwrap_or(DEFAULT_BRANCH_NAME);
|
.unwrap_or(DEFAULT_BRANCH_NAME);
|
||||||
let timeline_id = env
|
let timeline_id = env
|
||||||
.get_branch_timeline_id(branch_name, tenant_id)
|
.get_branch_timeline_id(branch_name, tenant_id)
|
||||||
.ok_or_else(|| {
|
.ok_or_else(|| {
|
||||||
anyhow!("Found no timeline id for branch name '{}'", branch_name)
|
anyhow!("Found no timeline id for branch name '{branch_name}'")
|
||||||
})?;
|
})?;
|
||||||
let lsn = sub_args
|
let lsn = sub_args
|
||||||
.value_of("lsn")
|
.get_one::<String>("lsn")
|
||||||
.map(Lsn::from_str)
|
.map(|lsn_str| Lsn::from_str(lsn_str))
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("Failed to parse Lsn from the request")?;
|
.context("Failed to parse Lsn from the request")?;
|
||||||
let pg_version = sub_args
|
let pg_version = sub_args
|
||||||
.value_of("pg-version")
|
.get_one::<u32>("pg-version")
|
||||||
.unwrap()
|
.copied()
|
||||||
.parse::<u32>()
|
.context("Failed to `pg-version` from the argument string")?;
|
||||||
.context("Failed to parse postgres version from the argument string")?;
|
|
||||||
// when used with custom port this results in non obvious behaviour
|
// when used with custom port this results in non obvious behaviour
|
||||||
// port is remembered from first start command, i e
|
// port is remembered from first start command, i e
|
||||||
// start --port X
|
// start --port X
|
||||||
// stop
|
// stop
|
||||||
// start <-- will also use port X even without explicit port argument
|
// start <-- will also use port X even without explicit port argument
|
||||||
println!(
|
println!("Starting new postgres (v{pg_version}) {node_name} on timeline {timeline_id} ...");
|
||||||
"Starting new postgres (v{}) {} on timeline {} ...",
|
|
||||||
pg_version, node_name, timeline_id
|
|
||||||
);
|
|
||||||
|
|
||||||
let node =
|
let node =
|
||||||
cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?;
|
cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?;
|
||||||
@@ -917,18 +677,18 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
|||||||
}
|
}
|
||||||
"stop" => {
|
"stop" => {
|
||||||
let node_name = sub_args
|
let node_name = sub_args
|
||||||
.value_of("node")
|
.get_one::<String>("node")
|
||||||
.ok_or_else(|| anyhow!("No node name was provided to stop"))?;
|
.ok_or_else(|| anyhow!("No node name was provided to stop"))?;
|
||||||
let destroy = sub_args.is_present("destroy");
|
let destroy = sub_args.get_flag("destroy");
|
||||||
|
|
||||||
let node = cplane
|
let node = cplane
|
||||||
.nodes
|
.nodes
|
||||||
.get(&(tenant_id, node_name.to_owned()))
|
.get(&(tenant_id, node_name.to_string()))
|
||||||
.with_context(|| format!("postgres {} is not found", node_name))?;
|
.with_context(|| format!("postgres {node_name} is not found"))?;
|
||||||
node.stop(destroy)?;
|
node.stop(destroy)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
_ => bail!("Unexpected pg subcommand '{}'", sub_name),
|
_ => bail!("Unexpected pg subcommand '{sub_name}'"),
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -946,7 +706,10 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
|||||||
}
|
}
|
||||||
|
|
||||||
Some(("stop", stop_match)) => {
|
Some(("stop", stop_match)) => {
|
||||||
let immediate = stop_match.value_of("stop-mode") == Some("immediate");
|
let immediate = stop_match
|
||||||
|
.get_one::<String>("stop-mode")
|
||||||
|
.map(|s| s.as_str())
|
||||||
|
== Some("immediate");
|
||||||
|
|
||||||
if let Err(e) = pageserver.stop(immediate) {
|
if let Err(e) = pageserver.stop(immediate) {
|
||||||
eprintln!("pageserver stop failed: {}", e);
|
eprintln!("pageserver stop failed: {}", e);
|
||||||
@@ -996,7 +759,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
|||||||
};
|
};
|
||||||
|
|
||||||
// All the commands take an optional safekeeper name argument
|
// All the commands take an optional safekeeper name argument
|
||||||
let sk_id = if let Some(id_str) = sub_args.value_of("id") {
|
let sk_id = if let Some(id_str) = sub_args.get_one::<String>("id") {
|
||||||
NodeId(id_str.parse().context("while parsing safekeeper id")?)
|
NodeId(id_str.parse().context("while parsing safekeeper id")?)
|
||||||
} else {
|
} else {
|
||||||
DEFAULT_SAFEKEEPER_ID
|
DEFAULT_SAFEKEEPER_ID
|
||||||
@@ -1012,7 +775,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
|||||||
}
|
}
|
||||||
|
|
||||||
"stop" => {
|
"stop" => {
|
||||||
let immediate = sub_args.value_of("stop-mode") == Some("immediate");
|
let immediate =
|
||||||
|
sub_args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
|
||||||
|
|
||||||
if let Err(e) = safekeeper.stop(immediate) {
|
if let Err(e) = safekeeper.stop(immediate) {
|
||||||
eprintln!("safekeeper stop failed: {}", e);
|
eprintln!("safekeeper stop failed: {}", e);
|
||||||
@@ -1021,7 +785,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
|||||||
}
|
}
|
||||||
|
|
||||||
"restart" => {
|
"restart" => {
|
||||||
let immediate = sub_args.value_of("stop-mode") == Some("immediate");
|
let immediate =
|
||||||
|
sub_args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
|
||||||
|
|
||||||
if let Err(e) = safekeeper.stop(immediate) {
|
if let Err(e) = safekeeper.stop(immediate) {
|
||||||
eprintln!("safekeeper stop failed: {}", e);
|
eprintln!("safekeeper stop failed: {}", e);
|
||||||
@@ -1065,7 +830,8 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||||
let immediate = sub_match.value_of("stop-mode") == Some("immediate");
|
let immediate =
|
||||||
|
sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
|
||||||
|
|
||||||
let pageserver = PageServerNode::from_env(env);
|
let pageserver = PageServerNode::from_env(env);
|
||||||
|
|
||||||
@@ -1098,3 +864,219 @@ fn try_stop_etcd_process(env: &local_env::LocalEnv) {
|
|||||||
eprintln!("etcd stop failed: {e}");
|
eprintln!("etcd stop failed: {e}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn cli() -> Command {
|
||||||
|
let branch_name_arg = Arg::new("branch-name")
|
||||||
|
.long("branch-name")
|
||||||
|
.help("Name of the branch to be created or used as an alias for other services")
|
||||||
|
.required(false);
|
||||||
|
|
||||||
|
let pg_node_arg = Arg::new("node").help("Postgres node name").required(false);
|
||||||
|
|
||||||
|
let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
|
||||||
|
|
||||||
|
let tenant_id_arg = Arg::new("tenant-id")
|
||||||
|
.long("tenant-id")
|
||||||
|
.help("Tenant id. Represented as a hexadecimal string 32 symbols length")
|
||||||
|
.required(false);
|
||||||
|
|
||||||
|
let timeline_id_arg = Arg::new("timeline-id")
|
||||||
|
.long("timeline-id")
|
||||||
|
.help("Timeline id. Represented as a hexadecimal string 32 symbols length")
|
||||||
|
.required(false);
|
||||||
|
|
||||||
|
let pg_version_arg = Arg::new("pg-version")
|
||||||
|
.long("pg-version")
|
||||||
|
.help("Postgres version to use for the initial tenant")
|
||||||
|
.required(false)
|
||||||
|
.value_parser(value_parser!(u32))
|
||||||
|
.default_value(DEFAULT_PG_VERSION);
|
||||||
|
|
||||||
|
let port_arg = Arg::new("port")
|
||||||
|
.long("port")
|
||||||
|
.required(false)
|
||||||
|
.value_parser(value_parser!(u16))
|
||||||
|
.value_name("port");
|
||||||
|
|
||||||
|
let stop_mode_arg = Arg::new("stop-mode")
|
||||||
|
.short('m')
|
||||||
|
.value_parser(["fast", "immediate"])
|
||||||
|
.help("If 'immediate', don't flush repository data at shutdown")
|
||||||
|
.required(false)
|
||||||
|
.value_name("stop-mode");
|
||||||
|
|
||||||
|
let pageserver_config_args = Arg::new("pageserver-config-override")
|
||||||
|
.long("pageserver-config-override")
|
||||||
|
.num_args(1)
|
||||||
|
.action(ArgAction::Append)
|
||||||
|
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
|
||||||
|
.required(false);
|
||||||
|
|
||||||
|
let lsn_arg = Arg::new("lsn")
|
||||||
|
.long("lsn")
|
||||||
|
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
|
||||||
|
.required(false);
|
||||||
|
|
||||||
|
Command::new("Neon CLI")
|
||||||
|
.arg_required_else_help(true)
|
||||||
|
.version(GIT_VERSION)
|
||||||
|
.subcommand(
|
||||||
|
Command::new("init")
|
||||||
|
.about("Initialize a new Neon repository")
|
||||||
|
.arg(pageserver_config_args.clone())
|
||||||
|
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
|
||||||
|
.arg(
|
||||||
|
Arg::new("config")
|
||||||
|
.long("config")
|
||||||
|
.required(false)
|
||||||
|
.value_parser(value_parser!(PathBuf))
|
||||||
|
.value_name("config"),
|
||||||
|
)
|
||||||
|
.arg(pg_version_arg.clone())
|
||||||
|
)
|
||||||
|
.subcommand(
|
||||||
|
Command::new("timeline")
|
||||||
|
.about("Manage timelines")
|
||||||
|
.subcommand(Command::new("list")
|
||||||
|
.about("List all timelines, available to this pageserver")
|
||||||
|
.arg(tenant_id_arg.clone()))
|
||||||
|
.subcommand(Command::new("branch")
|
||||||
|
.about("Create a new timeline, using another timeline as a base, copying its data")
|
||||||
|
.arg(tenant_id_arg.clone())
|
||||||
|
.arg(branch_name_arg.clone())
|
||||||
|
.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name")
|
||||||
|
.help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
|
||||||
|
.arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn")
|
||||||
|
.help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false)))
|
||||||
|
.subcommand(Command::new("create")
|
||||||
|
.about("Create a new blank timeline")
|
||||||
|
.arg(tenant_id_arg.clone())
|
||||||
|
.arg(branch_name_arg.clone())
|
||||||
|
.arg(pg_version_arg.clone())
|
||||||
|
)
|
||||||
|
.subcommand(Command::new("import")
|
||||||
|
.about("Import timeline from basebackup directory")
|
||||||
|
.arg(tenant_id_arg.clone())
|
||||||
|
.arg(timeline_id_arg.clone())
|
||||||
|
.arg(Arg::new("node-name").long("node-name")
|
||||||
|
.help("Name to assign to the imported timeline"))
|
||||||
|
.arg(Arg::new("base-tarfile")
|
||||||
|
.long("base-tarfile")
|
||||||
|
.value_parser(value_parser!(PathBuf))
|
||||||
|
.help("Basebackup tarfile to import")
|
||||||
|
)
|
||||||
|
.arg(Arg::new("base-lsn").long("base-lsn")
|
||||||
|
.help("Lsn the basebackup starts at"))
|
||||||
|
.arg(Arg::new("wal-tarfile")
|
||||||
|
.long("wal-tarfile")
|
||||||
|
.value_parser(value_parser!(PathBuf))
|
||||||
|
.help("Wal to add after base")
|
||||||
|
)
|
||||||
|
.arg(Arg::new("end-lsn").long("end-lsn")
|
||||||
|
.help("Lsn the basebackup ends at"))
|
||||||
|
.arg(pg_version_arg.clone())
|
||||||
|
)
|
||||||
|
).subcommand(
|
||||||
|
Command::new("tenant")
|
||||||
|
.arg_required_else_help(true)
|
||||||
|
.about("Manage tenants")
|
||||||
|
.subcommand(Command::new("list"))
|
||||||
|
.subcommand(Command::new("create")
|
||||||
|
.arg(tenant_id_arg.clone())
|
||||||
|
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
|
||||||
|
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
|
||||||
|
.arg(pg_version_arg.clone())
|
||||||
|
)
|
||||||
|
.subcommand(Command::new("config")
|
||||||
|
.arg(tenant_id_arg.clone())
|
||||||
|
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.subcommand(
|
||||||
|
Command::new("pageserver")
|
||||||
|
.arg_required_else_help(true)
|
||||||
|
.about("Manage pageserver")
|
||||||
|
.subcommand(Command::new("status"))
|
||||||
|
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
|
||||||
|
.subcommand(Command::new("stop").about("Stop local pageserver")
|
||||||
|
.arg(stop_mode_arg.clone()))
|
||||||
|
.subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
|
||||||
|
)
|
||||||
|
.subcommand(
|
||||||
|
Command::new("safekeeper")
|
||||||
|
.arg_required_else_help(true)
|
||||||
|
.about("Manage safekeepers")
|
||||||
|
.subcommand(Command::new("start")
|
||||||
|
.about("Start local safekeeper")
|
||||||
|
.arg(safekeeper_id_arg.clone())
|
||||||
|
)
|
||||||
|
.subcommand(Command::new("stop")
|
||||||
|
.about("Stop local safekeeper")
|
||||||
|
.arg(safekeeper_id_arg.clone())
|
||||||
|
.arg(stop_mode_arg.clone())
|
||||||
|
)
|
||||||
|
.subcommand(Command::new("restart")
|
||||||
|
.about("Restart local safekeeper")
|
||||||
|
.arg(safekeeper_id_arg)
|
||||||
|
.arg(stop_mode_arg.clone())
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.subcommand(
|
||||||
|
Command::new("pg")
|
||||||
|
.arg_required_else_help(true)
|
||||||
|
.about("Manage postgres instances")
|
||||||
|
.subcommand(Command::new("list").arg(tenant_id_arg.clone()))
|
||||||
|
.subcommand(Command::new("create")
|
||||||
|
.about("Create a postgres compute node")
|
||||||
|
.arg(pg_node_arg.clone())
|
||||||
|
.arg(branch_name_arg.clone())
|
||||||
|
.arg(tenant_id_arg.clone())
|
||||||
|
.arg(lsn_arg.clone())
|
||||||
|
.arg(port_arg.clone())
|
||||||
|
.arg(
|
||||||
|
Arg::new("config-only")
|
||||||
|
.help("Don't do basebackup, create compute node with only config files")
|
||||||
|
.long("config-only")
|
||||||
|
.required(false))
|
||||||
|
.arg(pg_version_arg.clone())
|
||||||
|
)
|
||||||
|
.subcommand(Command::new("start")
|
||||||
|
.about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
|
||||||
|
.arg(pg_node_arg.clone())
|
||||||
|
.arg(tenant_id_arg.clone())
|
||||||
|
.arg(branch_name_arg)
|
||||||
|
.arg(timeline_id_arg)
|
||||||
|
.arg(lsn_arg)
|
||||||
|
.arg(port_arg)
|
||||||
|
.arg(pg_version_arg)
|
||||||
|
)
|
||||||
|
.subcommand(
|
||||||
|
Command::new("stop")
|
||||||
|
.arg(pg_node_arg)
|
||||||
|
.arg(tenant_id_arg)
|
||||||
|
.arg(
|
||||||
|
Arg::new("destroy")
|
||||||
|
.help("Also delete data directory (now optional, should be default in future)")
|
||||||
|
.long("destroy")
|
||||||
|
.action(ArgAction::SetTrue)
|
||||||
|
.required(false)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
)
|
||||||
|
.subcommand(
|
||||||
|
Command::new("start")
|
||||||
|
.about("Start page server and safekeepers")
|
||||||
|
.arg(pageserver_config_args)
|
||||||
|
)
|
||||||
|
.subcommand(
|
||||||
|
Command::new("stop")
|
||||||
|
.about("Stop page server and safekeepers")
|
||||||
|
.arg(stop_mode_arg)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn verify_cli() {
|
||||||
|
cli().debug_assert();
|
||||||
|
}
|
||||||
|
|||||||
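The hunks above are dominated by one mechanical change: moving from clap 3's untyped string accessors to clap 4's typed ones. As a minimal self-contained sketch of that pattern only (the `example` command and its arguments are illustrative, not taken from the repository):

// Sketch of the clap 3 -> clap 4 accessor migration applied throughout the diff:
// `value_of`/`is_present` become typed `get_one::<T>()`/`get_flag()`, backed by a value_parser.
use clap::{value_parser, Arg, ArgAction, Command};

fn main() {
    let matches = Command::new("example")
        .arg(
            Arg::new("pg-version")
                .long("pg-version")
                .value_parser(value_parser!(u32)) // clap validates and converts at parse time
                .default_value("14"),
        )
        .arg(Arg::new("destroy").long("destroy").action(ArgAction::SetTrue))
        .get_matches();

    // No manual `.parse::<u32>()` is needed anymore; clap already produced a u32.
    let pg_version: u32 = matches.get_one::<u32>("pg-version").copied().unwrap();
    let destroy = matches.get_flag("destroy");
    println!("pg_version={pg_version} destroy={destroy}");
}

Because the value parser does the conversion, the repeated `.unwrap().parse::<u32>()` chains can be dropped, which is exactly what the diff does for `pg-version`, `port`, and the `PathBuf` arguments.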
@@ -284,7 +284,7 @@ impl PostgresNode {
         conf.append("max_wal_senders", "10");
         // wal_log_hints is mandatory when running against pageserver (see gh issue#192)
         // TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
-        conf.append("wal_log_hints", "off");
+        conf.append("wal_log_hints", "on");
         conf.append("max_replication_slots", "10");
         conf.append("hot_standby", "on");
         conf.append("shared_buffers", "1MB");
@@ -12,13 +12,8 @@ use nix::unistd::Pid;
 use postgres::Config;
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
-use safekeeper::http::models::TimelineCreateRequest;
 use thiserror::Error;
-use utils::{
-    connstring::connection_address,
-    http::error::HttpErrorBody,
-    id::{NodeId, TenantId, TimelineId},
-};
+use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId};

 use crate::local_env::{LocalEnv, SafekeeperConf};
 use crate::storage::PageServerNode;
@@ -281,24 +276,4 @@ impl SafekeeperNode {
             .error_from_body()?;
         Ok(())
     }
-
-    pub fn timeline_create(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        peer_ids: Vec<NodeId>,
-    ) -> Result<()> {
-        Ok(self
-            .http_request(
-                Method::POST,
-                format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
-            )
-            .json(&TimelineCreateRequest {
-                timeline_id,
-                peer_ids,
-            })
-            .send()?
-            .error_from_body()?
-            .json()?)
-    }
 }
@@ -11,7 +11,7 @@ use anyhow::{bail, Context};
 use nix::errno::Errno;
 use nix::sys::signal::{kill, Signal};
 use nix::unistd::Pid;
-use pageserver::http::models::{
+use pageserver_api::models::{
     TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
 };
 use postgres::{Config, NoTls};
docs/rfcs/018-storage-messaging-2.md (new file, 163 lines)
@@ -0,0 +1,163 @@
# Storage messaging

Safekeepers need to communicate to each other to
* Trim WAL on safekeepers;
* Decide on which SK should push WAL to the S3;
* Decide on when to shut down SK<->pageserver connection;
* Understand state of each other to perform peer recovery;

Pageservers need to communicate to safekeepers to decide which SK should provide
WAL to the pageserver.

This is an iteration on [015-storage-messaging](https://github.com/neondatabase/neon/blob/main/docs/rfcs/015-storage-messaging.md) describing the current situation,
a potential performance issue, and ways to address it.

## Background

What we have currently is very close to the etcd variant described in
015-storage-messaging. Basically, we have a single `SkTimelineInfo` message
periodically sent by all safekeepers to etcd for each timeline.
* Safekeepers subscribe to it to learn the status of peers (currently they subscribe to
  'everything', but they can and should fetch data only for timelines they hold).
* Pageserver subscribes to it (separate watch per timeline) to learn safekeepers'
  positions; based on that, it decides from which safekeepers to pull WAL.

Also, safekeepers use the etcd elections API to make sure only a single safekeeper
offloads WAL.

It works, and callmemaybe is gone. However, this has a performance
hazard. Currently deployed etcd can do about 6k puts per second (using its own
`benchmark` tool); on my 6 core laptop, while running on tmpfs, this gets to
35k. Making the benchmark closer to our usage [etcd watch bench](https://github.com/arssher/etcd-client/blob/watch-bench/examples/watch_bench.rs),
I get ~10k received messages per second with various numbers of publisher-subscribers
(laptop, tmpfs). Dividing this by 12 (3 sks generate msg, 1 ps + 3 sk consume them) we
get about 800 active timelines, if the message is sent each second. Not extremely
low, but quite reachable.

A lot of idle watches seem to be ok though -- which is good, as the pageserver
subscribes to all its timelines regardless of their activity.

Also, running etcd with fsyncs disabled is messy -- the data dir must be wiped on
each restart or there is a risk of corruption errors.

The reason is that etcd does much more than what we need; it is a fault-tolerant
store with strong consistency, but I claim all we need here is just the simplest pub
sub with best-effort delivery, because
* We already have a centralized source of truth for long-running data, like which
  tlis are on which nodes -- the console.
* Momentary data (safekeeper/pageserver progress) doesn't make sense to persist.
  Instead of putting each change to the broker and expecting it to reliably deliver it,
  it is better to just have a constant flow of data for active timelines: 1) they
  serve as natural heartbeats -- if a node can't send, we shouldn't pull WAL from
  it; 2) it is simpler -- no need to track delivery to/from the broker.
  Moreover, latency here is important: the faster we obtain fresh data, the
  faster we can switch to a proper safekeeper after a failure.
* As for WAL offloading leader election, it is trivial to achieve through these
  heartbeats -- just take a suitable node through a deterministic rule (min node
  id). Once the network is stable, this is a converging process (well, except
  complicated failure topology, but even then making it converge is not
  hard). Such elections bear some risk of several offloaders running
  concurrently for a short period of time, but that's harmless.

Generally, if one needs strong consistency, electing a leader per se is not
enough; it must be accompanied with a number (logical clock ts), checked at
every action to track causality. s3 doesn't provide CAS, so it can't
differentiate an old leader from a new one; this must be solved differently.

We could use etcd CAS (its most powerful/useful primitive actually) to issue
these leader numbers (and e.g. prefix files in s3), but currently I don't see
a need for that.


Obviously, a best-effort pub sub is much simpler and more performant; the one proposed is described below.

## gRPC broker

I took tonic and [prototyped](https://github.com/neondatabase/neon/blob/asher/neon-broker/broker/src/broker.rs) the replacement of the functionality we currently use
with grpc streams and tokio mpsc channels. The implementation description is in the file header.

It is just 500 lines of code and the core functionality is complete. 1-1 pub sub
gives about 120k received messages per second; having multiple subscribers in
different connections quickly scales to 1 million received messages per second.
I had concerns about many concurrent streams in a single connection, but 2^20
subscribers still work (though they eat memory: with 10 publishers, 20GB are consumed;
in this implementation each publisher holds a full copy of all subscribers). There
is `bench.rs` nearby which I used for testing.

`SkTimelineInfo` is wired here, but another message can be added (e.g. if
pageservers want to communicate with each other) with templating.

### Fault tolerance

Since such a broker is stateless, we can run it under k8s. Or add proxying to
other members; with best-effort delivery this is simple.

### Security implications

Communication happens in a private network that is not exposed to users;
additionally we can add auth to the broker.

## Alternative: get existing pub-sub

We could take some existing pub sub solution, e.g. RabbitMQ, Redis. But in this
case IMV the simplicity of our own outweighs the external dependency costs (RabbitMQ is
much more complicated and needs a VM; Redis Rust client maintenance is not
ideal...). Also note that projects like CockroachDB and TiDB are based on gRPC
as well.

## Alternative: direct communication

Apart from being a transport, the broker solves one more task: discovery, i.e. letting
safekeepers and pageservers find each other. We can let safekeepers know, for
each timeline, both the other safekeepers for this timeline and the pageservers serving
it. In this case direct communication is possible:
- each safekeeper pushes to each other safekeeper the status of timelines residing
  on both of them, letting them remove WAL, decide who offloads, and decide on peer
  recovery;
- each safekeeper pushes to each pageserver the status of timelines residing on
  both of them, letting the pageserver choose from which sk to pull WAL;

It was mostly described in [014-safekeeper-gossip](https://github.com/neondatabase/neon/blob/main/docs/rfcs/014-safekeepers-gossip.md), but I want to recap it here.

The main pro is one dependency fewer: fewer moving parts, easier to run Neon
locally/manually, fewer places to monitor. Fault tolerance for the broker disappears --
no Kubernetes or the like. To me this is a big thing.

Also (though not a big thing) idle watches for inactive timelines disappear:
naturally safekeepers learn about the compute connection first and start pushing
status to the pageserver(s), notifying them that they should pull.

Importantly, I think that eventually knowing and persisting peers and
pageservers on safekeepers is inevitable:
- Knowing peer safekeepers for the timeline is required for correct
  automatic membership change -- the new member set must be hardened on the old
  majority before proceeding. It is required to get rid of sync-safekeepers
  as well (peer recovery up to flush_lsn).
- Knowing the pageservers where the timeline is attached is needed to
  1. Understand when to shut down activity on the timeline, i.e. push data to
     the broker. We can have a lot of timelines sleeping quietly which
     shouldn't occupy resources.
  2. Preserve WAL for these (currently we offload to s3 and take it from there,
     but serving locally is better, and we get one less condition on which WAL
     can be removed from s3).

I suppose this membership data should be passed to safekeepers directly from the
console because
1. The console is the original source of this data; conceptually this is the
   simplest way (rather than passing it through compute or something).
2. We already have similar code for deleting a timeline on safekeepers
   (and attaching/detaching a timeline on the pageserver); this is a typical
   action -- queue an operation against a storage node and execute it until it
   completes (or the timeline is dropped).

The cons of direct communication are:
- It is more complicated: each safekeeper should maintain the set of peers it talks
  to, and the set of timelines for each such peer -- they ought to be multiplexed
  into a single connection.
- In total, we have O(n^2) connections instead of O(n) with the broker scheme
  (still O(n) on each node). However, these are relatively stable, async and
  thus not very expensive; I don't think this is a big problem. Up to 10k
  storage nodes, I doubt the connection overhead would be noticeable.

I'd use gRPC for direct communication, and in this sense a gRPC-based broker is a
step towards it.
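To make the RFC's "simplest pub/sub with best-effort delivery" idea concrete, here is a rough in-process sketch only -- not the linked tonic prototype, and all type and field names here are hypothetical. A real broker would expose the same behaviour over gRPC streams; per-timeline broadcast channels capture the delivery semantics:

// Best-effort, non-persistent pub/sub keyed by timeline: slow or absent
// subscribers simply miss messages, matching the RFC's argument that momentary
// progress data is not worth persisting.
use std::collections::HashMap;
use std::sync::Mutex;
use tokio::sync::broadcast;

#[derive(Clone, Debug)]
struct SkTimelineInfo {
    safekeeper_id: u64,
    commit_lsn: u64, // stand-in for the real Lsn type
}

#[derive(Default)]
struct Broker {
    // One channel per timeline; publishers and subscribers attach by timeline id.
    timelines: Mutex<HashMap<u128, broadcast::Sender<SkTimelineInfo>>>,
}

impl Broker {
    fn sender(&self, timeline: u128) -> broadcast::Sender<SkTimelineInfo> {
        self.timelines
            .lock()
            .unwrap()
            .entry(timeline)
            .or_insert_with(|| broadcast::channel(16).0)
            .clone()
    }

    fn publish(&self, timeline: u128, info: SkTimelineInfo) {
        // Best effort: if nobody is subscribed, send() errors and the message is dropped.
        let _ = self.sender(timeline).send(info);
    }

    fn subscribe(&self, timeline: u128) -> broadcast::Receiver<SkTimelineInfo> {
        self.sender(timeline).subscribe()
    }
}

#[tokio::main]
async fn main() {
    let broker = Broker::default();
    let mut rx = broker.subscribe(42);
    broker.publish(42, SkTimelineInfo { safekeeper_id: 1, commit_lsn: 100 });
    println!("{:?}", rx.recv().await.unwrap());
}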
@@ -8,7 +8,7 @@
 regex = "1.4.5"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
-serde_with = "1.12.0"
+serde_with = "2.0"
 once_cell = "1.13.0"

 utils = { path = "../utils" }
@@ -3,7 +3,7 @@
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
 use once_cell::sync::Lazy;
-use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec};
+use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
 pub use prometheus::register;
 pub use prometheus::{core, default_registry, proto};
@@ -17,6 +17,7 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
 pub use prometheus::{register_int_gauge, IntGauge};
 pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
 pub use prometheus::{Encoder, TextEncoder};
+use prometheus::{Registry, Result};

 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
@@ -32,13 +33,27 @@ macro_rules! register_uint_gauge_vec {
     }};
 }

+/// Special internal registry, to collect metrics independently from the default registry.
+/// Was introduced to fix deadlock with lazy registration of metrics in the default registry.
+static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
+
+/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
+/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
+/// while holding the lock.
+pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
+    INTERNAL_REGISTRY.register(c)
+}
+
 /// Gathers all Prometheus metrics and records the I/O stats just before that.
 ///
 /// Metrics gathering is a relatively simple and standalone operation, so
 /// it might be fine to do it this way to keep things simple.
 pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
     update_rusage_metrics();
-    prometheus::gather()
+    let mut mfs = prometheus::gather();
+    let mut internal_mfs = INTERNAL_REGISTRY.gather();
+    mfs.append(&mut internal_mfs);
+    mfs
 }

 static DISK_IO_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
@@ -62,6 +77,16 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
     0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
 ];

+pub fn set_build_info_metric(revision: &str) {
+    let metric = register_int_gauge_vec!(
+        "libmetrics_build_info",
+        "Build/version information",
+        &["revision"]
+    )
+    .expect("Failed to register build info metric");
+    metric.with_label_values(&[revision]).set(1);
+}
+
 // Records I/O stats in a "cross-platform" way.
 // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
 // An alternative is to read procfs (`/proc/[pid]/io`) which does not work under macOS at all, hence abandoned.
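A hedged usage sketch of the internal-registry pattern introduced above (the gauge name and the `main` wrapper are invented for illustration, not taken from the repository): custom collectors go into the internal registry before the first gather, and the combined gather merges both registries' output, which is how the lazy-registration deadlock is avoided.

use once_cell::sync::Lazy;
use prometheus::{core::Collector, IntGauge, Registry};

static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);

// Must be called before the first gather_all(), mirroring the rule in the doc comment above.
fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
    INTERNAL_REGISTRY.register(c)
}

fn gather_all() -> Vec<prometheus::proto::MetricFamily> {
    let mut mfs = prometheus::gather(); // default registry
    mfs.append(&mut INTERNAL_REGISTRY.gather()); // merge the internal registry's families
    mfs
}

fn main() -> prometheus::Result<()> {
    let g = IntGauge::new("example_gauge", "example")?; // hypothetical metric
    register_internal(Box::new(g.clone()))?; // registered up front, before any gather
    g.set(1);
    println!("{} metric families", gather_all().len());
    Ok(())
}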
libs/pageserver_api/Cargo.toml (new file, 12 lines)
@@ -0,0 +1,12 @@
[package]
name = "pageserver_api"
version = "0.1.0"
edition = "2021"

[dependencies]
serde = { version = "1.0", features = ["derive"] }
serde_with = "2.0"
const_format = "0.2.21"

utils = { path = "../utils" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
libs/pageserver_api/src/lib.rs (new file, 9 lines)
@@ -0,0 +1,9 @@
use const_format::formatcp;

/// Public API types
pub mod models;

pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
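For reference, a tiny sketch of the `const_format::formatcp!` pattern the new crate uses: the listen-address strings are assembled at compile time from the port constants (constant names copied from the file above; the assertion is illustrative).

use const_format::formatcp;

const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
// Interpolates the named constant at compile time, producing a &'static str.
const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");

fn main() {
    assert_eq!(DEFAULT_HTTP_LISTEN_ADDR, "127.0.0.1:9898");
}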
@@ -7,7 +7,17 @@ use utils::{
     lsn::Lsn,
 };

-use crate::tenant::TenantState;
+/// A state of a tenant in pageserver's memory.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub enum TenantState {
+    /// Tenant is fully operational, its background jobs might be running or not.
+    Active { background_jobs_running: bool },
+    /// A tenant is recognized by pageserver, but not yet ready to operate:
+    /// e.g. not present locally and being downloaded or being read into memory from the file system.
+    Paused,
+    /// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated.
+    Broken,
+}

 #[serde_as]
 #[derive(Serialize, Deserialize)]
@@ -113,9 +123,15 @@ pub struct TenantInfo {
     pub has_in_progress_downloads: Option<bool>,
 }

+/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct LocalTimelineInfo {
+pub struct TimelineInfo {
+    #[serde_as(as = "DisplayFromStr")]
+    pub tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
+    pub timeline_id: TimelineId,
+
     #[serde_as(as = "Option<DisplayFromStr>")]
     pub ancestor_timeline_id: Option<TimelineId>,
     #[serde_as(as = "Option<DisplayFromStr>")]
@@ -139,28 +155,33 @@ pub struct LocalTimelineInfo {
     /// the timestamp (in microseconds) of the last received message
     pub last_received_msg_ts: Option<u128>,
     pub pg_version: u32,
+
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub remote_consistent_lsn: Option<Lsn>,
+    pub awaits_download: bool,
+
+    // Some of the above fields are duplicated in 'local' and 'remote', for backwards-
+    // compatility with older clients.
+    pub local: LocalTimelineInfo,
+    pub remote: RemoteTimelineInfo,
+}
+
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct LocalTimelineInfo {
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub ancestor_timeline_id: Option<TimelineId>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub ancestor_lsn: Option<Lsn>,
+    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
+    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
 }

 #[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct RemoteTimelineInfo {
-    #[serde_as(as = "DisplayFromStr")]
-    pub remote_consistent_lsn: Lsn,
-    pub awaits_download: bool,
-}
-
-///
-/// This represents the output of the "timeline_detail" API call.
-///
-#[serde_as]
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct TimelineInfo {
-    #[serde_as(as = "DisplayFromStr")]
-    pub tenant_id: TenantId,
-    #[serde_as(as = "DisplayFromStr")]
-    pub timeline_id: TimelineId,
-    pub local: Option<LocalTimelineInfo>,
-    pub remote: Option<RemoteTimelineInfo>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub remote_consistent_lsn: Option<Lsn>,
 }

 pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
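As a sketch of the wire format of the relocated `TenantState` enum (the round-trip below is illustrative and not part of the diff): serde's default externally-tagged representation serializes the struct variant as a nested object, which is what API clients of the new `pageserver_api` crate would see.

use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TenantState {
    Active { background_jobs_running: bool },
    Paused,
    Broken,
}

fn main() -> serde_json::Result<()> {
    let s = TenantState::Active { background_jobs_running: true };
    let json = serde_json::to_string(&s)?;
    // Externally tagged: {"Active":{"background_jobs_running":true}}; unit variants become "Paused"/"Broken".
    println!("{json}");
    assert_eq!(serde_json::from_str::<TenantState>(&json)?, s);
    Ok(())
}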
@@ -13,7 +13,7 @@ crc32c = "0.6.0"
 hex = "0.4.3"
 once_cell = "1.13.0"
 log = "0.4.14"
-memoffset = "0.6.2"
+memoffset = "0.7"
 thiserror = "1.0"
 serde = { version = "1.0", features = ["derive"] }
 utils = { path = "../utils" }
@@ -26,4 +26,4 @@ wal_craft = { path = "wal_craft" }

 [build-dependencies]
 anyhow = "1.0"
-bindgen = "0.60.1"
+bindgen = "0.61"
@@ -7,7 +7,7 @@ edition = "2021"

 [dependencies]
 anyhow = "1.0"
-clap = "3.0"
+clap = "4.0"
 env_logger = "0.9"
 log = "0.4"
 once_cell = "1.13.0"
@@ -1,68 +1,19 @@
|
|||||||
use anyhow::*;
|
use anyhow::*;
|
||||||
use clap::{App, Arg, ArgMatches};
|
use clap::{value_parser, Arg, ArgMatches, Command};
|
||||||
use std::str::FromStr;
|
use std::{path::PathBuf, str::FromStr};
|
||||||
use wal_craft::*;
|
use wal_craft::*;
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("wal_craft=info"))
|
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("wal_craft=info"))
|
||||||
.init();
|
.init();
|
||||||
let type_arg = &Arg::new("type")
|
let arg_matches = cli().get_matches();
|
||||||
.takes_value(true)
|
|
||||||
.help("Type of WAL to craft")
|
|
||||||
.possible_values([
|
|
||||||
Simple::NAME,
|
|
||||||
LastWalRecordXlogSwitch::NAME,
|
|
||||||
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME,
|
|
||||||
WalRecordCrossingSegmentFollowedBySmallOne::NAME,
|
|
||||||
LastWalRecordCrossingSegment::NAME,
|
|
||||||
])
|
|
||||||
.required(true);
|
|
||||||
let arg_matches = App::new("Postgres WAL crafter")
|
|
||||||
.about("Crafts Postgres databases with specific WAL properties")
|
|
||||||
.subcommand(
|
|
||||||
App::new("print-postgres-config")
|
|
||||||
.about("Print the configuration required for PostgreSQL server before running this script")
|
|
||||||
)
|
|
||||||
.subcommand(
|
|
||||||
App::new("with-initdb")
|
|
||||||
.about("Craft WAL in a new data directory first initialized with initdb")
|
|
||||||
.arg(type_arg)
|
|
||||||
.arg(
|
|
||||||
Arg::new("datadir")
|
|
||||||
.takes_value(true)
|
|
||||||
.help("Data directory for the Postgres server")
|
|
||||||
.required(true)
|
|
||||||
)
|
|
||||||
.arg(
|
|
||||||
Arg::new("pg-distrib-dir")
|
|
||||||
.long("pg-distrib-dir")
|
|
||||||
.takes_value(true)
|
|
||||||
.help("Directory with Postgres distributions (bin and lib directories, e.g. pg_install containing subpath `v14/bin/postgresql`)")
|
|
||||||
.default_value("/usr/local")
|
|
||||||
)
|
|
||||||
.arg(
|
|
||||||
Arg::new("pg-version")
|
|
||||||
.long("pg-version")
|
|
||||||
.help("Postgres version to use for the initial tenant")
|
|
||||||
.required(true)
|
|
||||||
.takes_value(true)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
.subcommand(
|
|
||||||
App::new("in-existing")
|
|
||||||
.about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.")
|
|
||||||
.arg(type_arg)
|
|
||||||
.arg(
|
|
||||||
Arg::new("connection")
|
|
||||||
.takes_value(true)
|
|
||||||
.help("Connection string to the Postgres database to populate")
|
|
||||||
.required(true)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
.get_matches();
|
|
||||||
|
|
||||||
let wal_craft = |arg_matches: &ArgMatches, client| {
|
let wal_craft = |arg_matches: &ArgMatches, client| {
|
||||||
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches.value_of("type").unwrap() {
|
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches
|
||||||
|
.get_one::<String>("type")
|
||||||
|
.map(|s| s.as_str())
|
||||||
|
.context("'type' is required")?
|
||||||
|
{
|
||||||
Simple::NAME => Simple::craft(client)?,
|
Simple::NAME => Simple::craft(client)?,
|
||||||
LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?,
|
LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?,
|
||||||
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => {
|
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => {
|
||||||
@@ -72,12 +23,12 @@ fn main() -> Result<()> {
|
|||||||
WalRecordCrossingSegmentFollowedBySmallOne::craft(client)?
|
WalRecordCrossingSegmentFollowedBySmallOne::craft(client)?
|
||||||
}
|
}
|
||||||
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
|
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
|
||||||
a => panic!("Unknown --type argument: {}", a),
|
a => panic!("Unknown --type argument: {a}"),
|
||||||
};
|
};
|
||||||
for lsn in intermediate_lsns {
|
for lsn in intermediate_lsns {
|
||||||
println!("intermediate_lsn = {}", lsn);
|
println!("intermediate_lsn = {lsn}");
|
||||||
}
|
}
|
||||||
println!("end_of_wal = {}", end_of_wal_lsn);
|
println!("end_of_wal = {end_of_wal_lsn}");
|
||||||
Ok(())
|
Ok(())
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -85,20 +36,24 @@ fn main() -> Result<()> {
 None => panic!("No subcommand provided"),
 Some(("print-postgres-config", _)) => {
 for cfg in REQUIRED_POSTGRES_CONFIG.iter() {
-println!("{}", cfg);
+println!("{cfg}");
 }
 Ok(())
 }

 Some(("with-initdb", arg_matches)) => {
 let cfg = Conf {
-pg_version: arg_matches
-.value_of("pg-version")
-.unwrap()
-.parse::<u32>()
-.context("Failed to parse postgres version from the argument string")?,
-pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
-datadir: arg_matches.value_of("datadir").unwrap().into(),
+pg_version: *arg_matches
+.get_one::<u32>("pg-version")
+.context("'pg-version' is required")?,
+pg_distrib_dir: arg_matches
+.get_one::<PathBuf>("pg-distrib-dir")
+.context("'pg-distrib-dir' is required")?
+.to_owned(),
+datadir: arg_matches
+.get_one::<PathBuf>("datadir")
+.context("'datadir' is required")?
+.to_owned(),
 };
 cfg.initdb()?;
 let srv = cfg.start_server()?;
@@ -108,9 +63,77 @@ fn main() -> Result<()> {
 }
 Some(("in-existing", arg_matches)) => wal_craft(
 arg_matches,
-&mut postgres::Config::from_str(arg_matches.value_of("connection").unwrap())?
-.connect(postgres::NoTls)?,
+&mut postgres::Config::from_str(
+arg_matches
+.get_one::<String>("connection")
+.context("'connection' is required")?,
+)
+.context(
+"'connection' argument value could not be parsed as a postgres connection string",
+)?
+.connect(postgres::NoTls)?,
 ),
 Some(_) => panic!("Unknown subcommand"),
 }
 }

+fn cli() -> Command {
+let type_arg = &Arg::new("type")
+.help("Type of WAL to craft")
+.value_parser([
+Simple::NAME,
+LastWalRecordXlogSwitch::NAME,
+LastWalRecordXlogSwitchEndsOnPageBoundary::NAME,
+WalRecordCrossingSegmentFollowedBySmallOne::NAME,
+LastWalRecordCrossingSegment::NAME,
+])
+.required(true);
+
+Command::new("Postgres WAL crafter")
+.about("Crafts Postgres databases with specific WAL properties")
+.subcommand(
+Command::new("print-postgres-config")
+.about("Print the configuration required for PostgreSQL server before running this script")
+)
+.subcommand(
+Command::new("with-initdb")
+.about("Craft WAL in a new data directory first initialized with initdb")
+.arg(type_arg)
+.arg(
+Arg::new("datadir")
+.help("Data directory for the Postgres server")
+.value_parser(value_parser!(PathBuf))
+.required(true)
+)
+.arg(
+Arg::new("pg-distrib-dir")
+.long("pg-distrib-dir")
+.value_parser(value_parser!(PathBuf))
+.help("Directory with Postgres distributions (bin and lib directories, e.g. pg_install containing subpath `v14/bin/postgresql`)")
+.default_value("/usr/local")
+)
+.arg(
+Arg::new("pg-version")
+.long("pg-version")
+.help("Postgres version to use for the initial tenant")
+.value_parser(value_parser!(u32))
+.required(true)
+
+)
+)
+.subcommand(
+Command::new("in-existing")
+.about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.")
+.arg(type_arg)
+.arg(
+Arg::new("connection")
+.help("Connection string to the Postgres database to populate")
+.required(true)
+)
+)
+}
+
+#[test]
+fn verify_cli() {
+cli().debug_assert();
+}
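An aside on the clap upgrade visible in this file: the v3 builder calls (`App::new`, `.takes_value`, `value_of`) are replaced by the v4 equivalents (`Command`, typed `value_parser`, `ArgAction`, `get_one`). A minimal, self-contained sketch of that pattern follows; the argument names are made up for illustration and are not taken from the repository.

// clap 4 migration sketch (illustrative names, not the repo's CLI)
use clap::{value_parser, Arg, ArgAction, Command};

fn cli() -> Command {
    Command::new("example")
        .arg(
            Arg::new("verbose")
                .long("verbose")
                // replaces the old `.takes_value(false)` flag style
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("count")
                .long("count")
                // replaces `.takes_value(true)` plus a manual `.parse::<u32>()`
                .value_parser(value_parser!(u32))
                .required(true),
        )
}

fn main() {
    let matches = cli().get_matches();
    // typed access replaces `value_of(..).unwrap().parse()`
    let count: u32 = *matches.get_one::<u32>("count").expect("required");
    let verbose = matches.get_flag("verbose");
    println!("count={count} verbose={verbose}");
}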
@@ -15,7 +15,7 @@ serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }
 tokio-util = { version = "0.7", features = ["io"] }
-toml_edit = { version = "0.13", features = ["easy"] }
+toml_edit = { version = "0.14", features = ["easy"] }
 tracing = "0.1.27"

 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
libs/safekeeper_api/Cargo.toml (new file, +12 lines)
@@ -0,0 +1,12 @@
+[package]
+name = "safekeeper_api"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+serde = { version = "1.0", features = ["derive"] }
+serde_with = "2.0"
+const_format = "0.2.21"
+
+utils = { path = "../utils" }
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
libs/safekeeper_api/src/lib.rs (new file, +10 lines)
@@ -0,0 +1,10 @@
+use const_format::formatcp;
+
+/// Public API types
+pub mod models;
+
+pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454;
+pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
+
+pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676;
+pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
libs/safekeeper_api/src/models.rs (new file, +24 lines)
@@ -0,0 +1,24 @@
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+
+use utils::{
+id::{NodeId, TenantId, TimelineId},
+lsn::Lsn,
+};
+
+#[serde_as]
+#[derive(Serialize, Deserialize)]
+pub struct TimelineCreateRequest {
+#[serde_as(as = "DisplayFromStr")]
+pub tenant_id: TenantId,
+#[serde_as(as = "DisplayFromStr")]
+pub timeline_id: TimelineId,
+pub peer_ids: Option<Vec<NodeId>>,
+pub pg_version: u32,
+pub system_id: Option<u64>,
+pub wal_seg_size: Option<u32>,
+#[serde_as(as = "DisplayFromStr")]
+pub commit_lsn: Lsn,
+// If not passed, it is assigned to the beginning of commit_lsn segment.
+pub local_start_lsn: Option<Lsn>,
+}
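The `DisplayFromStr` annotations above mean those fields travel as strings over the wire rather than as native JSON numbers or objects. A minimal sketch of that serde_with pattern, using a plain u16 instead of the repository's id and LSN types, so it stands alone:

// serde_with DisplayFromStr sketch; the struct is illustrative, not the real API type
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};

#[serde_as]
#[derive(Serialize, Deserialize, Debug)]
struct Example {
    // serialized through Display/FromStr, so it becomes a JSON string
    #[serde_as(as = "DisplayFromStr")]
    port: u16,
}

fn main() {
    let json = serde_json::to_string(&Example { port: 5454 }).unwrap();
    assert_eq!(json, r#"{"port":"5454"}"#);
}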
@@ -20,7 +20,7 @@ tokio = { version = "1.17", features = ["macros"]}
 tokio-rustls = "0.23"
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
-nix = "0.23.0"
+nix = "0.25"
 signal-hook = "0.3.10"
 rand = "0.8.3"
 jsonwebtoken = "8"
@@ -28,7 +28,7 @@ hex = { version = "0.4.3", features = ["serde"] }
 rustls = "0.20.2"
 rustls-split = "0.3.0"
 git-version = "0.3.5"
-serde_with = "1.12.0"
+serde_with = "2.0"
 once_cell = "1.13.0"


@@ -40,7 +40,7 @@ byteorder = "1.4.3"
 bytes = "1.0.1"
 hex-literal = "0.3"
 tempfile = "3.2"
-criterion = "0.3"
+criterion = "0.4"
 rustls-pemfile = "1"

 [[bench]]
@@ -9,6 +9,7 @@ use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
 use routerify::RequestInfo;
 use routerify::{Middleware, Router, RouterBuilder, RouterService};
+use tokio::task::JoinError;
 use tracing::info;

 use std::future::Future;
@@ -35,7 +36,13 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
 let mut buffer = vec![];
 let encoder = TextEncoder::new();

-let metrics = metrics::gather();
+let metrics = tokio::task::spawn_blocking(move || {
+// Currently we take a lot of mutexes while collecting metrics, so it's
+// better to spawn a blocking task to avoid blocking the event loop.
+metrics::gather()
+})
+.await
+.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
 encoder.encode(&metrics, &mut buffer).unwrap();

 let response = Response::builder()
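The change above moves a lock-heavy metrics gather off the async executor. A standalone sketch of that spawn_blocking pattern, where `expensive_gather` is a stand-in for the blocking call and not a real metrics API:

// spawn_blocking sketch: keep blocking, mutex-heavy work off the event loop
use tokio::task::JoinError;

async fn gather_off_the_event_loop() -> Result<Vec<u64>, JoinError> {
    fn expensive_gather() -> Vec<u64> {
        // pretend this takes many locks or burns CPU
        (0..1_000_000u64).collect()
    }
    // runs on the blocking thread pool; awaiting only parks this task
    tokio::task::spawn_blocking(expensive_gather).await
}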
@@ -66,6 +66,11 @@ impl Lsn {
 (self.0 % seg_sz as u64) as usize
 }

+/// Compute LSN of the segment start.
+pub fn segment_lsn(self, seg_sz: usize) -> Lsn {
+Lsn(self.0 - (self.0 % seg_sz as u64))
+}
+
 /// Compute the segment number
 pub fn segment_number(self, seg_sz: usize) -> u64 {
 self.0 / seg_sz as u64
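The new helper rounds an LSN down to the start of its WAL segment. A quick, self-contained check of the arithmetic, assuming the common 16 MiB segment size purely for illustration:

// segment_lsn arithmetic sketch (standalone, u64 instead of the Lsn newtype)
fn segment_lsn(lsn: u64, seg_sz: u64) -> u64 {
    lsn - (lsn % seg_sz)
}

fn main() {
    let seg_sz = 16 * 1024 * 1024; // assumed 16 MiB WAL segment
    // 0x0250_0000 lies inside the segment that starts at 0x0200_0000
    assert_eq!(segment_lsn(0x0250_0000, seg_sz), 0x0200_0000);
    assert_eq!(segment_lsn(0x0250_0000, seg_sz) % seg_sz, 0);
    println!("ok");
}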
@@ -15,7 +15,7 @@ use std::sync::Arc;
 use std::task::Poll;
 use tracing::{debug, error, trace};

-use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
+use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
 use tokio_rustls::TlsAcceptor;

 #[async_trait::async_trait]
@@ -66,8 +66,8 @@ pub enum ProcessMsgResult {
 /// Always-writeable sock_split stream.
 /// May not be readable. See [`PostgresBackend::take_stream_in`]
 pub enum Stream {
-Unencrypted(tokio::net::TcpStream),
-Tls(Box<tokio_rustls::server::TlsStream<tokio::net::TcpStream>>),
+Unencrypted(BufReader<tokio::net::TcpStream>),
+Tls(Box<tokio_rustls::server::TlsStream<BufReader<tokio::net::TcpStream>>>),
 Broken,
 }

@@ -157,7 +157,7 @@ impl PostgresBackend {
 let peer_addr = socket.peer_addr()?;

 Ok(Self {
-stream: Stream::Unencrypted(socket),
+stream: Stream::Unencrypted(BufReader::new(socket)),
 buf_out: BytesMut::with_capacity(10 * 1024),
 state: ProtoState::Initialization,
 md5_salt: [0u8; 4],
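The hunk above wraps the raw TCP socket in tokio's BufReader so small protocol reads are buffered instead of hitting the kernel one read at a time. A minimal sketch of that wrapping, with a made-up address and a trivial line read:

// BufReader-over-TcpStream sketch (address and usage are illustrative)
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::net::TcpStream;

async fn read_one_line(addr: &str) -> std::io::Result<String> {
    let socket = TcpStream::connect(addr).await?;
    // buffered reads: many small protocol reads, few syscalls
    let mut reader = BufReader::new(socket);
    let mut line = String::new();
    reader.read_line(&mut line).await?;
    Ok(line)
}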
@@ -10,6 +10,7 @@ use serde::{Deserialize, Serialize};
 use std::{
 borrow::Cow,
 collections::HashMap,
+fmt,
 future::Future,
 io::{self, Cursor},
 str,
@@ -124,6 +125,19 @@ pub struct CancelKeyData {
 pub cancel_key: i32,
 }

+impl fmt::Display for CancelKeyData {
+fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+let hi = (self.backend_pid as u64) << 32;
+let lo = self.cancel_key as u64;
+let id = hi | lo;
+
+// This format is more compact and might work better for logs.
+f.debug_tuple("CancelKeyData")
+.field(&format_args!("{:x}", id))
+.finish()
+}
+}
+
 use rand::distributions::{Distribution, Standard};
 impl Distribution<CancelKeyData> for Standard {
 fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> CancelKeyData {
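The Display impl above packs the backend pid into the high 32 bits and the cancel key into the low 32 bits, then prints the combined value in hex. A standalone sketch of that packing with made-up positive values:

// CancelKeyData packing sketch (values are arbitrary examples)
fn main() {
    let backend_pid: i32 = 0x1234;
    let cancel_key: i32 = 0x5678;
    let id = ((backend_pid as u64) << 32) | (cancel_key as u64);
    println!("CancelKeyData({:x})", id); // prints CancelKeyData(123400005678)
}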
@@ -23,7 +23,7 @@ futures = "0.3.13"
 hex = "0.4.3"
 hyper = "0.14"
 itertools = "0.10.3"
-clap = "3.0"
+clap = { version = "4.0", features = ["string"] }
 daemonize = "0.4.1"
 tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
 tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
@@ -38,26 +38,27 @@ tar = "0.4.33"
 humantime = "2.1.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
-serde_with = "1.12.0"
+serde_with = "2.0"
 humantime-serde = "1.1.1"

 pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }

-toml_edit = { version = "0.13", features = ["easy"] }
+toml_edit = { version = "0.14", features = ["easy"] }
 scopeguard = "1.1.0"
 const_format = "0.2.21"
 tracing = "0.1.36"
 signal-hook = "0.3.10"
 url = "2"
-nix = "0.23"
+nix = "0.25"
 once_cell = "1.13.0"
 crossbeam-utils = "0.8.5"
 fail = "0.5.0"
 git-version = "0.3.5"
 rstar = "0.9.3"
 num-traits = "0.2.15"
-amplify_num = "0.4.1"
+amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }

+pageserver_api = { path = "../libs/pageserver_api" }
 postgres_ffi = { path = "../libs/postgres_ffi" }
 etcd_broker = { path = "../libs/etcd_broker" }
 metrics = { path = "../libs/metrics" }
@@ -66,7 +67,13 @@ remote_storage = { path = "../libs/remote_storage" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
 close_fds = "0.3.2"
 walkdir = "2.3.2"
+dashmap = "5.4.0"

 [dev-dependencies]
+criterion = "0.4"
 hex-literal = "0.3"
 tempfile = "3.2"
+
+[[bench]]
+name = "bench_layer_map"
+harness = false
pageserver/benches/bench_layer_map.rs (new file, +5866 lines)
File diff suppressed because it is too large
@@ -1,35 +0,0 @@
-//! Main entry point for the dump_layerfile executable
-//!
-//! A handy tool for debugging, that's all.
-use anyhow::Result;
-use clap::{App, Arg};
-use pageserver::page_cache;
-use pageserver::tenant::dump_layerfile_from_path;
-use pageserver::virtual_file;
-use std::path::PathBuf;
-use utils::project_git_version;
-
-project_git_version!(GIT_VERSION);
-
-fn main() -> Result<()> {
-let arg_matches = App::new("Neon dump_layerfile utility")
-.about("Dump contents of one layer file, for debugging")
-.version(GIT_VERSION)
-.arg(
-Arg::new("path")
-.help("Path to file to dump")
-.required(true)
-.index(1),
-)
-.get_matches();
-
-let path = PathBuf::from(arg_matches.value_of("path").unwrap());
-
-// Basic initialization of things that don't change after startup
-virtual_file::init(10);
-page_cache::init(100);
-
-dump_layerfile_from_path(&path, true)?;
-
-Ok(())
-}
@@ -6,10 +6,12 @@ use tracing::*;

 use anyhow::{anyhow, bail, Context, Result};

-use clap::{App, Arg};
+use clap::{Arg, ArgAction, Command};
 use daemonize::Daemonize;

 use fail::FailScenario;
+use metrics::set_build_info_metric;
+
 use pageserver::{
 config::{defaults::*, PageServerConf},
 http, page_cache, page_service, profiling, task_mgr,
@@ -31,72 +33,35 @@ use utils::{

 project_git_version!(GIT_VERSION);

+const FEATURES: &[&str] = &[
+#[cfg(feature = "testing")]
+"testing",
+#[cfg(feature = "fail/failpoints")]
+"fail/failpoints",
+#[cfg(feature = "profiling")]
+"profiling",
+];
+
 fn version() -> String {
 format!(
-"{GIT_VERSION} profiling:{} failpoints:{}",
-cfg!(feature = "profiling"),
-fail::has_failpoints()
+"{GIT_VERSION} failpoints: {}, features: {:?}",
+fail::has_failpoints(),
+FEATURES,
 )
 }

 fn main() -> anyhow::Result<()> {
-let arg_matches = App::new("Neon page server")
-.about("Materializes WAL stream to pages and serves them to the postgres")
-.version(&*version())
-.arg(
+let arg_matches = cli().get_matches();

-Arg::new("daemonize")
-.short('d')
-.long("daemonize")
-.takes_value(false)
-.help("Run in the background"),
-)
-.arg(
-Arg::new("init")
-.long("init")
-.takes_value(false)
-.help("Initialize pageserver with all given config overrides"),
-)
-.arg(
-Arg::new("workdir")
-.short('D')
-.long("workdir")
-.takes_value(true)
-.help("Working directory for the pageserver"),
-)
-// See `settings.md` for more details on the extra configuration patameters pageserver can process
-.arg(
-Arg::new("config-override")
-.short('c')
-.takes_value(true)
-.number_of_values(1)
-.multiple_occurrences(true)
-.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there).
-Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
-)
-.arg(Arg::new("update-config").long("update-config").takes_value(false).help(
-"Update the config file when started",
-))
-.arg(
-Arg::new("enabled-features")
-.long("enabled-features")
-.takes_value(false)
-.help("Show enabled compile time features"),
-)
-.get_matches();
-
-if arg_matches.is_present("enabled-features") {
-let features: &[&str] = &[
-#[cfg(feature = "testing")]
-"testing",
-#[cfg(feature = "profiling")]
-"profiling",
-];
-println!("{{\"features\": {features:?} }}");
+if arg_matches.get_flag("enabled-features") {
+println!("{{\"features\": {FEATURES:?} }}");
 return Ok(());
 }

-let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".neon"));
+let workdir = arg_matches
+.get_one::<String>("workdir")
+.map(Path::new)
+.unwrap_or_else(|| Path::new(".neon"));
 let workdir = workdir
 .canonicalize()
 .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
@@ -110,7 +75,7 @@ fn main() -> anyhow::Result<()> {
 )
 })?;

-let daemonize = arg_matches.is_present("daemonize");
+let daemonize = arg_matches.get_flag("daemonize");

 let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
 ControlFlow::Continue(conf) => conf,
@@ -148,8 +113,8 @@ fn initialize_config(
 arg_matches: clap::ArgMatches,
 workdir: &Path,
 ) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
-let init = arg_matches.is_present("init");
-let update_config = init || arg_matches.is_present("update-config");
+let init = arg_matches.get_flag("init");
+let update_config = init || arg_matches.get_flag("update-config");

 let (mut toml, config_file_exists) = if cfg_file_path.is_file() {
 if init {
@@ -191,13 +156,10 @@ fn initialize_config(
 )
 };

-if let Some(values) = arg_matches.values_of("config-override") {
+if let Some(values) = arg_matches.get_many::<String>("config-override") {
 for option_line in values {
 let doc = toml_edit::Document::from_str(option_line).with_context(|| {
-format!(
-"Option '{}' could not be parsed as a toml document",
-option_line
-)
+format!("Option '{option_line}' could not be parsed as a toml document")
 })?;

 for (key, item) in doc.iter() {
@@ -239,7 +201,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
 // Initialize logger
 let log_file = logging::init(LOG_FILE_NAME, daemonize)?;

-info!("version: {GIT_VERSION}");
+info!("version: {}", version());

 // TODO: Check that it looks like a valid repository before going further

@@ -356,6 +318,8 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
 },
 );

+set_build_info_metric(GIT_VERSION);
+
 // All started up! Now just sit and wait for shutdown signal.
 signals.handle(|signal| match signal {
 Signal::Quit => {
@@ -378,3 +342,55 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
 }
 })
 }
+
+fn cli() -> Command {
+Command::new("Neon page server")
+.about("Materializes WAL stream to pages and serves them to the postgres")
+.version(version())
+.arg(
+
+Arg::new("daemonize")
+.short('d')
+.long("daemonize")
+.action(ArgAction::SetTrue)
+.help("Run in the background"),
+)
+.arg(
+Arg::new("init")
+.long("init")
+.action(ArgAction::SetTrue)
+.help("Initialize pageserver with all given config overrides"),
+)
+.arg(
+Arg::new("workdir")
+.short('D')
+.long("workdir")
+.help("Working directory for the pageserver"),
+)
+// See `settings.md` for more details on the extra configuration patameters pageserver can process
+.arg(
+Arg::new("config-override")
+.short('c')
+.num_args(1)
+.action(ArgAction::Append)
+.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
+Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
+)
+.arg(
+Arg::new("update-config")
+.long("update-config")
+.action(ArgAction::SetTrue)
+.help("Update the config file when started"),
+)
+.arg(
+Arg::new("enabled-features")
+.long("enabled-features")
+.action(ArgAction::SetTrue)
+.help("Show enabled compile time features"),
+)
+}
+
+#[test]
+fn verify_cli() {
+cli().debug_assert();
+}
pageserver/src/bin/pageserver_binutils.rs (new file, +154 lines)
@@ -0,0 +1,154 @@
+//! A helper tool to manage pageserver binary files.
+//! Accepts a file as an argument, attempts to parse it with all ways possible
+//! and prints its interpreted context.
+//!
+//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
+use std::{
+path::{Path, PathBuf},
+str::FromStr,
+};
+
+use anyhow::Context;
+use clap::{value_parser, Arg, Command};
+
+use pageserver::{
+page_cache,
+tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
+virtual_file,
+};
+use postgres_ffi::ControlFileData;
+use utils::{lsn::Lsn, project_git_version};
+
+project_git_version!(GIT_VERSION);
+
+const METADATA_SUBCOMMAND: &str = "metadata";
+
+fn main() -> anyhow::Result<()> {
+let arg_matches = cli().get_matches();
+
+match arg_matches.subcommand() {
+Some((subcommand_name, subcommand_matches)) => {
+let path = subcommand_matches
+.get_one::<PathBuf>("metadata_path")
+.context("'metadata_path' argument is missing")?
+.to_path_buf();
+anyhow::ensure!(
+subcommand_name == METADATA_SUBCOMMAND,
+"Unknown subcommand {subcommand_name}"
+);
+handle_metadata(&path, subcommand_matches)?;
+}
+None => {
+let path = arg_matches
+.get_one::<PathBuf>("path")
+.context("'path' argument is missing")?
+.to_path_buf();
+println!(
+"No subcommand specified, attempting to guess the format for file {}",
+path.display()
+);
+if let Err(e) = read_pg_control_file(&path) {
+println!(
+"Failed to read input file as a pg control one: {e:#}\n\
+Attempting to read it as layer file"
+);
+print_layerfile(&path)?;
+}
+}
+};
+Ok(())
+}
+
+fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
+let control_file = ControlFileData::decode(&std::fs::read(&control_file_path)?)?;
+println!("{control_file:?}");
+let control_file_initdb = Lsn(control_file.checkPoint);
+println!(
+"pg_initdb_lsn: {}, aligned: {}",
+control_file_initdb,
+control_file_initdb.align()
+);
+Ok(())
+}
+
+fn print_layerfile(path: &Path) -> anyhow::Result<()> {
+// Basic initialization of things that don't change after startup
+virtual_file::init(10);
+page_cache::init(100);
+dump_layerfile_from_path(path, true)
+}
+
+fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
+let metadata_bytes = std::fs::read(&path)?;
+let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
+println!("Current metadata:\n{meta:?}");
+let mut update_meta = false;
+if let Some(disk_consistent_lsn) = arg_matches.get_one::<String>("disk_consistent_lsn") {
+meta = TimelineMetadata::new(
+Lsn::from_str(disk_consistent_lsn)?,
+meta.prev_record_lsn(),
+meta.ancestor_timeline(),
+meta.ancestor_lsn(),
+meta.latest_gc_cutoff_lsn(),
+meta.initdb_lsn(),
+meta.pg_version(),
+);
+update_meta = true;
+}
+if let Some(prev_record_lsn) = arg_matches.get_one::<String>("prev_record_lsn") {
+meta = TimelineMetadata::new(
+meta.disk_consistent_lsn(),
+Some(Lsn::from_str(prev_record_lsn)?),
+meta.ancestor_timeline(),
+meta.ancestor_lsn(),
+meta.latest_gc_cutoff_lsn(),
+meta.initdb_lsn(),
+meta.pg_version(),
+);
+update_meta = true;
+}
+
+if update_meta {
+let metadata_bytes = meta.to_bytes()?;
+std::fs::write(&path, &metadata_bytes)?;
+}
+
+Ok(())
+}
+
+fn cli() -> Command {
+Command::new("Neon Pageserver binutils")
+.about("Reads pageserver (and related) binary files management utility")
+.version(GIT_VERSION)
+.arg(
+Arg::new("path")
+.help("Input file path")
+.value_parser(value_parser!(PathBuf))
+.required(false),
+)
+.subcommand(
+Command::new(METADATA_SUBCOMMAND)
+.about("Read and update pageserver metadata file")
+.arg(
+Arg::new("metadata_path")
+.help("Input metadata file path")
+.value_parser(value_parser!(PathBuf))
+.required(false),
+)
+.arg(
+Arg::new("disk_consistent_lsn")
+.long("disk_consistent_lsn")
+.help("Replace disk consistent Lsn"),
+)
+.arg(
+Arg::new("prev_record_lsn")
+.long("prev_record_lsn")
+.help("Replace previous record Lsn"),
+),
+)
+}
+
+#[test]
+fn verify_cli() {
+cli().debug_assert();
+}
@@ -1,75 +0,0 @@
-//! Main entry point for the edit_metadata executable
-//!
-//! A handy tool for debugging, that's all.
-use anyhow::Result;
-use clap::{App, Arg};
-use pageserver::tenant::metadata::TimelineMetadata;
-use std::path::PathBuf;
-use std::str::FromStr;
-use utils::{lsn::Lsn, project_git_version};
-
-project_git_version!(GIT_VERSION);
-
-fn main() -> Result<()> {
-let arg_matches = App::new("Neon update metadata utility")
-.about("Dump or update metadata file")
-.version(GIT_VERSION)
-.arg(
-Arg::new("path")
-.help("Path to metadata file")
-.required(true),
-)
-.arg(
-Arg::new("disk_lsn")
-.short('d')
-.long("disk_lsn")
-.takes_value(true)
-.help("Replace disk constistent lsn"),
-)
-.arg(
-Arg::new("prev_lsn")
-.short('p')
-.long("prev_lsn")
-.takes_value(true)
-.help("Previous record LSN"),
-)
-.get_matches();
-
-let path = PathBuf::from(arg_matches.value_of("path").unwrap());
-let metadata_bytes = std::fs::read(&path)?;
-let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
-println!("Current metadata:\n{:?}", &meta);
-
-let mut update_meta = false;
-
-if let Some(disk_lsn) = arg_matches.value_of("disk_lsn") {
-meta = TimelineMetadata::new(
-Lsn::from_str(disk_lsn)?,
-meta.prev_record_lsn(),
-meta.ancestor_timeline(),
-meta.ancestor_lsn(),
-meta.latest_gc_cutoff_lsn(),
-meta.initdb_lsn(),
-meta.pg_version(),
-);
-update_meta = true;
-}
-
-if let Some(prev_lsn) = arg_matches.value_of("prev_lsn") {
-meta = TimelineMetadata::new(
-meta.disk_consistent_lsn(),
-Some(Lsn::from_str(prev_lsn)?),
-meta.ancestor_timeline(),
-meta.ancestor_lsn(),
-meta.latest_gc_cutoff_lsn(),
-meta.initdb_lsn(),
-meta.pg_version(),
-);
-update_meta = true;
-}
-if update_meta {
-let metadata_bytes = meta.to_bytes()?;
-std::fs::write(&path, &metadata_bytes)?;
-}
-Ok(())
-}
@@ -30,10 +30,10 @@ pub mod defaults {
 use crate::tenant_config::defaults::*;
 use const_format::formatcp;

-pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
-pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
-pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
-pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
+pub use pageserver_api::{
+DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
+DEFAULT_PG_LISTEN_PORT,
+};

 pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
 pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
@@ -1,3 +1,4 @@
-pub mod models;
 pub mod routes;
 pub use routes::make_router;
+
+pub use pageserver_api::models;
@@ -1,7 +1,11 @@
 openapi: "3.0.2"
 info:
 title: Page Server API
+description: Neon Pageserver API
 version: "1.0"
+license:
+name: "Apache"
+url: https://github.com/neondatabase/neon/blob/main/LICENSE
 servers:
 - url: ""
 paths:
@@ -207,6 +211,61 @@ paths:
 schema:
 $ref: "#/components/schemas/Error"

+/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
+parameters:
+- name: tenant_id
+in: path
+required: true
+schema:
+type: string
+format: hex
+- name: timeline_id
+in: path
+required: true
+schema:
+type: string
+format: hex
+get:
+description: Get LSN by a timestamp
+parameters:
+- name: timestamp
+in: query
+required: true
+schema:
+type: string
+format: date-time
+description: A timestamp to get the LSN
+responses:
+"200":
+description: OK
+content:
+application/json:
+schema:
+type: string
+"400":
+description: Error when no tenant id found in path, no timeline id or invalid timestamp
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/Error"
+"401":
+description: Unauthorized Error
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/UnauthorizedError"
+"403":
+description: Forbidden Error
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/ForbiddenError"
+"500":
+description: Generic operation error
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/Error"
 /v1/tenant/{tenant_id}/attach:
 parameters:
 - name: tenant_id
@@ -556,6 +615,9 @@ components:
 required:
 - timeline_id
 - tenant_id
+- last_record_lsn
+- disk_consistent_lsn
+- awaits_download
 properties:
 timeline_id:
 type: string
@@ -563,33 +625,15 @@ components:
 tenant_id:
 type: string
 format: hex
-local:
-$ref: "#/components/schemas/LocalTimelineInfo"
-remote:
-$ref: "#/components/schemas/RemoteTimelineInfo"
-RemoteTimelineInfo:
-type: object
-required:
-- awaits_download
-- remote_consistent_lsn
-properties:
-awaits_download:
-type: boolean
-remote_consistent_lsn:
-type: string
-format: hex
-LocalTimelineInfo:
-type: object
-required:
-- last_record_lsn
-- disk_consistent_lsn
-properties:
 last_record_lsn:
 type: string
 format: hex
 disk_consistent_lsn:
 type: string
 format: hex
+remote_consistent_lsn:
+type: string
+format: hex
 ancestor_timeline_id:
 type: string
 format: hex
@@ -614,7 +658,39 @@ components:
 format: hex
 last_received_msg_ts:
 type: integer
+awaits_download:
+type: boolean
+
+# These 'local' and 'remote' fields just duplicate some of the fields
+# above. They are kept for backwards-compatibility. They can be removed,
+# when the control plane has been updated to look at the above fields
+# directly.
+local:
+$ref: "#/components/schemas/LocalTimelineInfo"
+remote:
+$ref: "#/components/schemas/RemoteTimelineInfo"
+
+LocalTimelineInfo:
+type: object
+properties:
+ancestor_timeline_id:
+type: string
+format: hex
+ancestor_lsn:
+type: string
+format: hex
+current_logical_size:
+type: integer
+current_physical_size:
+type: integer
+RemoteTimelineInfo:
+type: object
+required:
+- remote_consistent_lsn
+properties:
+remote_consistent_lsn:
+type: string
+format: hex
 Error:
 type: object
 required:
@@ -12,6 +12,7 @@ use super::models::{
 StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
 TimelineCreateRequest,
 };
+use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::storage_sync;
 use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
 use crate::tenant::{TenantState, Timeline};
@@ -78,13 +79,13 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
 get_state(request).conf
 }

-// Helper functions to construct a LocalTimelineInfo struct for a timeline
-fn local_timeline_info_from_timeline(
+// Helper function to construct a TimelineInfo struct for a timeline
+async fn build_timeline_info(
+state: &State,
 timeline: &Arc<Timeline>,
 include_non_incremental_logical_size: bool,
 include_non_incremental_physical_size: bool,
-) -> anyhow::Result<LocalTimelineInfo> {
+) -> anyhow::Result<TimelineInfo> {
 let last_record_lsn = timeline.get_last_record_lsn();
 let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
 let guard = timeline.last_received_wal.lock().unwrap();
@@ -99,24 +100,47 @@ fn local_timeline_info_from_timeline(
 }
 };

-let info = LocalTimelineInfo {
-ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
-ancestor_lsn: {
-match timeline.get_ancestor_lsn() {
-Lsn(0) => None,
-lsn @ Lsn(_) => Some(lsn),
-}
-},
+let (remote_consistent_lsn, awaits_download) = if let Some(remote_entry) = state
+.remote_index
+.read()
+.await
+.timeline_entry(&TenantTimelineId {
+tenant_id: timeline.tenant_id,
+timeline_id: timeline.timeline_id,
+}) {
+(
+Some(remote_entry.metadata.disk_consistent_lsn()),
+remote_entry.awaits_download,
+)
+} else {
+(None, false)
+};
+
+let ancestor_timeline_id = timeline.get_ancestor_timeline_id();
+let ancestor_lsn = match timeline.get_ancestor_lsn() {
+Lsn(0) => None,
+lsn @ Lsn(_) => Some(lsn),
+};
+let current_logical_size = match timeline.get_current_logical_size() {
+Ok(size) => Some(size),
+Err(err) => {
+error!("Timeline info creation failed to get current logical size: {err:?}");
+None
+}
+};
+let current_physical_size = Some(timeline.get_physical_size());
+
+let info = TimelineInfo {
+tenant_id: timeline.tenant_id,
+timeline_id: timeline.timeline_id,
+ancestor_timeline_id,
+ancestor_lsn,
 disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
 last_record_lsn,
 prev_record_lsn: Some(timeline.get_prev_record_lsn()),
 latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
-current_logical_size: Some(
-timeline
-.get_current_logical_size()
-.context("Timeline info creation failed to get current logical size")?,
-),
-current_physical_size: Some(timeline.get_physical_size()),
+current_logical_size,
+current_physical_size,
 current_logical_size_non_incremental: if include_non_incremental_logical_size {
 Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?)
 } else {
@@ -131,32 +155,25 @@ fn local_timeline_info_from_timeline(
 last_received_msg_lsn,
 last_received_msg_ts,
 pg_version: timeline.pg_version,
+
+remote_consistent_lsn,
+awaits_download,
+
+// Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility
+// with the control plane.
+local: LocalTimelineInfo {
+ancestor_timeline_id,
+ancestor_lsn,
+current_logical_size,
+current_physical_size,
+},
+remote: RemoteTimelineInfo {
+remote_consistent_lsn,
+},
 };
 Ok(info)
 }

-fn list_local_timelines(
-tenant_id: TenantId,
-include_non_incremental_logical_size: bool,
-include_non_incremental_physical_size: bool,
-) -> Result<Vec<(TimelineId, LocalTimelineInfo)>> {
-let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
-let timelines = tenant.list_timelines();
-
-let mut local_timeline_info = Vec::with_capacity(timelines.len());
-for (timeline_id, repository_timeline) in timelines {
-local_timeline_info.push((
-timeline_id,
-local_timeline_info_from_timeline(
-&repository_timeline,
-include_non_incremental_logical_size,
-include_non_incremental_physical_size,
-)?,
-))
-}
-Ok(local_timeline_info)
-}
-
 // healthcheck handler
 async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
 let config = get_config(&request);
@@ -168,6 +185,8 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
 let request_data: TimelineCreateRequest = json_request(&mut request).await?;
 check_permission(&request, Some(tenant_id))?;

+let state = get_state(&request);
+
 let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
 let new_timeline_info = async {
 match tenant.create_timeline(
@@ -178,14 +197,10 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
 ).await {
 Ok(Some(new_timeline)) => {
 // Created. Construct a TimelineInfo for it.
-let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)
+let timeline_info = build_timeline_info(state, &new_timeline, false, false)
+.await
 .map_err(ApiError::InternalServerError)?;
-Ok(Some(TimelineInfo {
-tenant_id,
-timeline_id: new_timeline.timeline_id,
-local: Some(local_info),
-remote: None,
-}))
+Ok(Some(timeline_info))
 }
 Ok(None) => Ok(None), // timeline already exists
 Err(err) => Err(ApiError::InternalServerError(err)),
@@ -208,6 +223,8 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
 query_param_present(&request, "include-non-incremental-physical-size");
 check_permission(&request, Some(tenant_id))?;

+let state = get_state(&request);
+
 let timelines = tokio::task::spawn_blocking(move || {
 let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
 let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
@@ -217,36 +234,18 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
 .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;

 let mut response_data = Vec::with_capacity(timelines.len());
-for (timeline_id, timeline) in timelines {
-let local = match local_timeline_info_from_timeline(
+for timeline in timelines {
+let timeline_info = build_timeline_info(
+state,
 &timeline,
 include_non_incremental_logical_size,
 include_non_incremental_physical_size,
-) {
-Ok(local) => Some(local),
-Err(e) => {
-error!("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}");
-None
-}
-};
+)
+.await
+.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
+.map_err(ApiError::InternalServerError)?;

-response_data.push(TimelineInfo {
-tenant_id,
-timeline_id,
-local,
-remote: get_state(&request)
-.remote_index
-.read()
-.await
-.timeline_entry(&TenantTimelineId {
-tenant_id,
-timeline_id,
-})
-.map(|remote_entry| RemoteTimelineInfo {
-remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
-awaits_download: remote_entry.awaits_download,
-}),
-})
+response_data.push(timeline_info);
 }

 json_response(StatusCode::OK, response_data)
@@ -265,6 +264,23 @@ fn query_param_present(request: &Request<Body>, param: &str) -> bool {
 .unwrap_or(false)
 }

+fn get_query_param(request: &Request<Body>, param_name: &str) -> Result<String, ApiError> {
+request.uri().query().map_or(
+Err(ApiError::BadRequest(anyhow!("empty query in request"))),
+|v| {
+url::form_urlencoded::parse(v.as_bytes())
+.into_owned()
+.find(|(k, _)| k == param_name)
+.map_or(
+Err(ApiError::BadRequest(anyhow!(
+"no {param_name} specified in query parameters"
+))),
+|(_, v)| Ok(v),
+)
+},
+)
+}
+
 async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
 let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
 let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
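The new `get_query_param` helper above walks the raw query string with `url::form_urlencoded` and returns the first match. A standalone sketch of the same lookup, detached from the hyper request and error types; the query string is a made-up example:

// query-parameter lookup sketch using url::form_urlencoded
fn get_query_param(query: &str, name: &str) -> Option<String> {
    url::form_urlencoded::parse(query.as_bytes())
        .into_owned()
        .find(|(k, _)| k == name)
        .map(|(_, v)| v)
}

fn main() {
    let q = "timestamp=2022-10-01T12%3A00%3A00Z&other=1";
    // percent-decoding happens during parsing
    assert_eq!(
        get_query_param(q, "timestamp").as_deref(),
        Some("2022-10-01T12:00:00Z")
    );
}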
@@ -274,59 +290,60 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
 query_param_present(&request, "include-non-incremental-physical-size");
 check_permission(&request, Some(tenant_id))?;

-let (local_timeline_info, remote_timeline_info) = async {
+let state = get_state(&request);
+
+let timeline_info = async {
 let timeline = tokio::task::spawn_blocking(move || {
 tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id)
 })
 .await
 .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;

-let local_timeline_info = match timeline.and_then(|timeline| {
-local_timeline_info_from_timeline(
-&timeline,
-include_non_incremental_logical_size,
-include_non_incremental_physical_size,
-)
-}) {
-Ok(local_info) => Some(local_info),
-Err(e) => {
-error!("Failed to get local timeline info: {e:#}");
-None
-}
-};
+let timeline = timeline.map_err(ApiError::NotFound)?;

-let remote_timeline_info = {
-let remote_index_read = get_state(&request).remote_index.read().await;
-remote_index_read
-.timeline_entry(&TenantTimelineId {
-tenant_id,
-timeline_id,
-})
-.map(|remote_entry| RemoteTimelineInfo {
-remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
-awaits_download: remote_entry.awaits_download,
-})
-};
-Ok::<_, ApiError>((local_timeline_info, remote_timeline_info))
+let timeline_info = build_timeline_info(
+state,
+&timeline,
+include_non_incremental_logical_size,
+include_non_incremental_physical_size,
+)
+.await
+.context("Failed to get local timeline info: {e:#}")
+.map_err(ApiError::InternalServerError)?;
+Ok::<_, ApiError>(timeline_info)
 }
 .instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id))
 .await?;

-if local_timeline_info.is_none() && remote_timeline_info.is_none() {
-Err(ApiError::NotFound(anyhow!(
-"Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely"
-)))
-} else {
-json_response(
-StatusCode::OK,
-TimelineInfo {
-tenant_id,
-timeline_id,
-local: local_timeline_info,
-remote: remote_timeline_info,
-},
-)
-}
+json_response(StatusCode::OK, timeline_info)
+}
+
+async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+check_permission(&request, Some(tenant_id))?;
+
+let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+let timestamp_raw = get_query_param(&request, "timestamp")?;
+let timestamp = humantime::parse_rfc3339(timestamp_raw.as_str())
+.with_context(|| format!("Invalid time: {:?}", timestamp_raw))
+.map_err(ApiError::BadRequest)?;
+let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
+
+let timeline = tenant_mgr::get_tenant(tenant_id, true)
+.and_then(|tenant| tenant.get_timeline(timeline_id))
+.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
+.map_err(ApiError::NotFound)?;
+let result = match timeline
+.find_lsn_for_timestamp(timestamp_pg)
+.map_err(ApiError::InternalServerError)?
+{
+LsnForTimestamp::Present(lsn) => format!("{}", lsn),
+LsnForTimestamp::Future(_lsn) => "future".into(),
+LsnForTimestamp::Past(_lsn) => "past".into(),
+LsnForTimestamp::NoData(_lsn) => "nodata".into(),
+};
+json_response(StatusCode::OK, result)
 }

 // TODO makes sense to provide tenant config right away the same way as it handled in tenant_create
@@ -337,9 +354,16 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
 info!("Handling tenant attach {tenant_id}");

 tokio::task::spawn_blocking(move || match tenant_mgr::get_tenant(tenant_id, false) {
-Ok(_) => Err(ApiError::Conflict(
-"Tenant is already present locally".to_owned(),
-)),
+Ok(tenant) => {
+if tenant.list_timelines().is_empty() {
+info!("Attaching to tenant {tenant_id} with zero timelines");
+Ok(())
+} else {
+Err(ApiError::Conflict(
+"Tenant is already present locally".to_owned(),
+))
+}
+}
 Err(_) => Ok(()),
 })
 .await
@@ -362,7 +386,7 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
 }
 return json_response(StatusCode::ACCEPTED, ());
 }
-// no tenant in the index, release the lock to make the potentially lengthy download opetation
+// no tenant in the index, release the lock to make the potentially lengthy download operation
 drop(index_accessor);

 // download index parts for every tenant timeline
@@ -514,36 +538,27 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
 false
 });

-let tenant_state = match tenant {
-Ok(tenant) => tenant.current_state(),
+let (tenant_state, current_physical_size) = match tenant {
+Ok(tenant) => {
+let timelines = tenant.list_timelines();
+// Calculate total physical size of all timelines
+let mut current_physical_size = 0;
+for timeline in timelines {
+current_physical_size += timeline.get_physical_size();
+}
+
+(tenant.current_state(), Some(current_physical_size))
+}
 Err(e) => {
 error!("Failed to get local tenant state: {e:#}");
 if has_in_progress_downloads {
-TenantState::Paused
+(TenantState::Paused, None)
 } else {
-TenantState::Broken
+(TenantState::Broken, None)
 }
 }
 };

-let current_physical_size =
-match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false))
-.await
-.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?
-{
-Err(err) => {
-// Getting local timelines can fail when no local tenant directory is on disk (e.g, when tenant data is being downloaded).
-// In that case, put a warning message into log and operate normally.
-warn!("Failed to get local timelines for tenant {tenant_id}: {err}");
-None
-}
-Ok(local_timeline_infos) => Some(
-local_timeline_infos
-.into_iter()
-.fold(0, |acc, x| acc + x.1.current_physical_size.unwrap()),
-),
-};
-
 json_response(
 StatusCode::OK,
 TenantInfo {
@@ -732,7 +747,7 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
 json_response(StatusCode::OK, ())
 }

-#[cfg(any(feature = "testing", feature = "failpoints"))]
+#[cfg(feature = "testing")]
 async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
 if !fail::has_failpoints() {
 return Err(ApiError::BadRequest(anyhow!(
@@ -810,9 +825,7 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
 .get_timeline(timeline_id)
 .with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
 .map_err(ApiError::NotFound)?;
-timeline
-.reconstruct()
-.map_err(ApiError::InternalServerError)?;
+timeline.compact().map_err(ApiError::InternalServerError)?;

 json_response(StatusCode::OK, ())
 }
@@ -903,6 +916,10 @@ pub fn make_router(
 "/v1/tenant/:tenant_id/timeline/:timeline_id",
 timeline_detail_handler,
 )
+.get(
+"/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
+get_lsn_by_timestamp_handler,
+)
 .put(
 "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc",
 testing_api!("run timeline GC", timeline_gc_handler),
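The route added above exposes the old "get_lsn_by_timestamp" libpq command (removed from the query handler further down) as an HTTP endpoint: it takes the tenant and timeline IDs as path parameters plus an RFC 3339 "timestamp" query parameter, and answers with either an LSN or one of the markers "future", "past" or "nodata". A minimal client sketch; the pageserver address, port, placeholder IDs and the use of the reqwest crate are assumptions for illustration only:

// Hypothetical caller of the new endpoint; adjust host, port and the placeholder IDs.
fn lsn_by_timestamp() -> Result<String, Box<dyn std::error::Error>> {
    let url = "http://127.0.0.1:9898/v1/tenant/<tenant_id>/timeline/<timeline_id>\
               /get_lsn_by_timestamp?timestamp=2022-10-01T12:00:00Z";
    // The handler returns an LSN string, or "future" / "past" / "nodata".
    Ok(reqwest::blocking::get(url)?.text()?)
}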
@@ -119,32 +119,6 @@ impl<T> TenantTimelineValues<T> {
 fn new() -> Self {
 Self(HashMap::new())
 }

-fn with_capacity(capacity: usize) -> Self {
-Self(HashMap::with_capacity(capacity))
-}
-
-/// A convenience method to map certain values and omit some of them, if needed.
-/// Tenants that won't have any timeline entries due to the filtering, will still be preserved
-/// in the structure.
-fn filter_map<F, NewT>(self, map: F) -> TenantTimelineValues<NewT>
-where
-F: Fn(T) -> Option<NewT>,
-{
-let capacity = self.0.len();
-self.0.into_iter().fold(
-TenantTimelineValues::<NewT>::with_capacity(capacity),
-|mut new_values, (tenant_id, old_values)| {
-let new_timeline_values = new_values.0.entry(tenant_id).or_default();
-for (timeline_id, old_value) in old_values {
-if let Some(new_value) = map(old_value) {
-new_timeline_values.insert(timeline_id, new_value);
-}
-}
-new_values
-},
-)
-}
 }

 /// A suffix to be used during file sync from the remote storage,
@@ -181,35 +155,3 @@ mod backoff_defaults_tests {
 );
 }
 }
-
-#[cfg(test)]
-mod tests {
-use crate::tenant::harness::TIMELINE_ID;
-
-use super::*;
-
-#[test]
-fn tenant_timeline_value_mapping() {
-let first_tenant = TenantId::generate();
-let second_tenant = TenantId::generate();
-assert_ne!(first_tenant, second_tenant);
-
-let mut initial = TenantTimelineValues::new();
-initial
-.0
-.entry(first_tenant)
-.or_default()
-.insert(TIMELINE_ID, "test_value");
-let _ = initial.0.entry(second_tenant).or_default();
-assert_eq!(initial.0.len(), 2, "Should have entries for both tenants");
-
-let filtered = initial.filter_map(|_| None::<&str>).0;
-assert_eq!(
-filtered.len(),
-2,
-"Should have entries for both tenants even after filtering away all entries"
-);
-assert!(filtered.contains_key(&first_tenant));
-assert!(filtered.contains_key(&second_tenant));
-}
-}
@@ -107,18 +107,20 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {

 // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
 // or in testing they estimate how much we would upload if we did.
-static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
-register_int_counter!(
+static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
+register_int_counter_vec!(
 "pageserver_created_persistent_files_total",
 "Number of files created that are meant to be uploaded to cloud storage",
+&["tenant_id", "timeline_id"]
 )
 .expect("failed to define a metric")
 });

-static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounter> = Lazy::new(|| {
-register_int_counter!(
+static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
+register_int_counter_vec!(
 "pageserver_written_persistent_bytes_total",
 "Total bytes written that are meant to be uploaded to cloud storage",
+&["tenant_id", "timeline_id"]
 )
 .expect("failed to define a metric")
 });
@@ -275,11 +277,15 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
 /// smallest redo processing times. These buckets allow us to measure down
 /// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec.
 /// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec.
+///
+/// Values up to 1s are recorded because metrics show that we have redo
+/// durations and lock times larger than 0.250s.
 macro_rules! redo_histogram_time_buckets {
 () => {
 vec![
 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
-0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000,
+0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, 0.500_000,
+1.000_000,
 ]
 };
 }
@@ -294,6 +300,17 @@ macro_rules! redo_histogram_count_buckets {
 };
 }

+macro_rules! redo_bytes_histogram_count_buckets {
+() => {
+// powers of (2^.5), from 2^4.5 to 2^15 (22 buckets)
+// rounded up to the next multiple of 8 to capture any MAXALIGNed record of that size, too.
+vec![
+24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 184.0, 256.0, 368.0, 512.0, 728.0, 1024.0, 1456.0,
+2048.0, 2904.0, 4096.0, 5800.0, 8192.0, 11592.0, 16384.0, 23176.0, 32768.0,
+]
+};
+}
+
 pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
 register_histogram!(
 "pageserver_wal_redo_seconds",
@@ -321,6 +338,15 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
 .expect("failed to define a metric")
 });

+pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+register_histogram!(
+"pageserver_wal_redo_bytes_histogram",
+"Histogram of number of records replayed per redo",
+redo_bytes_histogram_count_buckets!(),
+)
+.expect("failed to define a metric")
+});
+
 pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
 register_int_counter!(
 "pageserver_replayed_wal_records_total",
@@ -386,8 +412,12 @@ impl TimelineMetrics {
 let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
 .get_metric_with_label_values(&[&tenant_id, &timeline_id])
 .unwrap();
-let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED.clone();
-let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN.clone();
+let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
+.get_metric_with_label_values(&[&tenant_id, &timeline_id])
+.unwrap();
+let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
+.get_metric_with_label_values(&[&tenant_id, &timeline_id])
+.unwrap();

 TimelineMetrics {
 tenant_id,
@@ -419,6 +449,8 @@ impl Drop for TimelineMetrics {
 let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
 let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
 let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
+let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);

 for op in STORAGE_TIME_OPERATIONS {
 let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
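The metrics hunks above turn the two upload counters into per-timeline IntCounterVecs: the TimelineMetrics constructor resolves a labelled child per tenant/timeline pair, and the Drop impl removes the label pair again so series do not leak after a timeline goes away. A rough sketch of that pattern with the prometheus crate; the metric and function names here are illustrative, not the pageserver's:

use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounterVec};

static FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "example_files_created_total",
        "Files created, labelled per timeline",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

fn on_file_created(tenant_id: &str, timeline_id: &str) {
    // Resolves (or lazily creates) the child for this label pair and bumps it.
    FILES_CREATED
        .with_label_values(&[tenant_id, timeline_id])
        .inc();
}

fn on_timeline_dropped(tenant_id: &str, timeline_id: &str) {
    // Mirrors the Drop impl above: forget the series once the timeline is gone.
    let _ = FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
}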
@@ -36,8 +36,9 @@
 //! mapping is automatically removed and the slot is marked free.
 //!

+use dashmap::mapref::entry::Entry;
+use dashmap::DashMap;
 use std::{
-collections::{hash_map::Entry, HashMap},
 convert::TryInto,
 sync::{
 atomic::{AtomicU8, AtomicUsize, Ordering},
@@ -168,18 +169,11 @@ impl Slot {
 pub struct PageCache {
 /// This contains the mapping from the cache key to buffer slot that currently
 /// contains the page, if any.
-///
-/// TODO: This is protected by a single lock. If that becomes a bottleneck,
-/// this HashMap can be replaced with a more concurrent version, there are
-/// plenty of such crates around.
-///
-/// If you add support for caching different kinds of objects, each object kind
-/// can have a separate mapping map, next to this field.
-materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
+materialized_page_map: DashMap<MaterializedPageHashKey, Vec<Version>>,

-ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
+ephemeral_page_map: DashMap<(u64, u32), usize>,

-immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,
+immutable_page_map: DashMap<(u64, u32), usize>,

 /// The actual buffers with their metadata.
 slots: Box<[Slot]>,
@@ -616,7 +610,7 @@ impl PageCache {
 fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
 match cache_key {
 CacheKey::MaterializedPage { hash_key, lsn } => {
-let map = self.materialized_page_map.read().unwrap();
+let map = &self.materialized_page_map;
 let versions = map.get(hash_key)?;

 let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
@@ -629,11 +623,11 @@ impl PageCache {
 Some(version.slot_idx)
 }
 CacheKey::EphemeralPage { file_id, blkno } => {
-let map = self.ephemeral_page_map.read().unwrap();
+let map = &self.ephemeral_page_map;
 Some(*map.get(&(*file_id, *blkno))?)
 }
 CacheKey::ImmutableFilePage { file_id, blkno } => {
-let map = self.immutable_page_map.read().unwrap();
+let map = &self.immutable_page_map;
 Some(*map.get(&(*file_id, *blkno))?)
 }
 }
@@ -646,7 +640,7 @@ impl PageCache {
 fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
 match key {
 CacheKey::MaterializedPage { hash_key, lsn } => {
-let map = self.materialized_page_map.read().unwrap();
+let map = &self.materialized_page_map;
 let versions = map.get(hash_key)?;

 if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
@@ -656,11 +650,11 @@ impl PageCache {
 }
 }
 CacheKey::EphemeralPage { file_id, blkno } => {
-let map = self.ephemeral_page_map.read().unwrap();
+let map = &self.ephemeral_page_map;
 Some(*map.get(&(*file_id, *blkno))?)
 }
 CacheKey::ImmutableFilePage { file_id, blkno } => {
-let map = self.immutable_page_map.read().unwrap();
+let map = &self.immutable_page_map;
 Some(*map.get(&(*file_id, *blkno))?)
 }
 }
@@ -675,7 +669,7 @@ impl PageCache {
 hash_key: old_hash_key,
 lsn: old_lsn,
 } => {
-let mut map = self.materialized_page_map.write().unwrap();
+let map = &self.materialized_page_map;
 if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
 let versions = old_entry.get_mut();

@@ -690,12 +684,12 @@ impl PageCache {
 }
 }
 CacheKey::EphemeralPage { file_id, blkno } => {
-let mut map = self.ephemeral_page_map.write().unwrap();
+let map = &self.ephemeral_page_map;
 map.remove(&(*file_id, *blkno))
 .expect("could not find old key in mapping");
 }
 CacheKey::ImmutableFilePage { file_id, blkno } => {
-let mut map = self.immutable_page_map.write().unwrap();
+let map = &self.immutable_page_map;
 map.remove(&(*file_id, *blkno))
 .expect("could not find old key in mapping");
 }
@@ -713,8 +707,8 @@ impl PageCache {
 hash_key: new_key,
 lsn: new_lsn,
 } => {
-let mut map = self.materialized_page_map.write().unwrap();
-let versions = map.entry(new_key.clone()).or_default();
+let map = &self.materialized_page_map;
+let mut versions = map.entry(new_key.clone()).or_default();
 match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
 Ok(version_idx) => Some(versions[version_idx].slot_idx),
 Err(version_idx) => {
@@ -730,7 +724,7 @@ impl PageCache {
 }
 }
 CacheKey::EphemeralPage { file_id, blkno } => {
-let mut map = self.ephemeral_page_map.write().unwrap();
+let map = &self.ephemeral_page_map;
 match map.entry((*file_id, *blkno)) {
 Entry::Occupied(entry) => Some(*entry.get()),
 Entry::Vacant(entry) => {
@@ -740,7 +734,7 @@ impl PageCache {
 }
 }
 CacheKey::ImmutableFilePage { file_id, blkno } => {
-let mut map = self.immutable_page_map.write().unwrap();
+let map = &self.immutable_page_map;
 match map.entry((*file_id, *blkno)) {
 Entry::Occupied(entry) => Some(*entry.get()),
 Entry::Vacant(entry) => {
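The page-cache hunks above swap the RwLock<HashMap<..>> mappings for DashMap, so readers and writers only contend on the shard that holds their key instead of on one cache-wide lock, and dashmap's Entry type slots into the places that previously used hash_map::Entry. A small self-contained sketch of the same pattern; the struct and key types are simplified stand-ins, not the pageserver's:

use dashmap::mapref::entry::Entry;
use dashmap::DashMap;

// Simplified stand-in for the ephemeral/immutable page mappings.
struct Mapping {
    map: DashMap<(u64, u32), usize>,
}

impl Mapping {
    fn lookup(&self, key: (u64, u32)) -> Option<usize> {
        // Takes a shard-level read lock only, not a cache-wide RwLock.
        self.map.get(&key).map(|slot_idx| *slot_idx)
    }

    fn insert_if_absent(&self, key: (u64, u32), slot_idx: usize) -> Option<usize> {
        match self.map.entry(key) {
            // Another thread mapped the page first: reuse its slot.
            Entry::Occupied(entry) => Some(*entry.get()),
            Entry::Vacant(entry) => {
                entry.insert(slot_idx);
                None
            }
        }
    }

    fn remove(&self, key: (u64, u32)) {
        self.map
            .remove(&key)
            .expect("could not find old key in mapping");
    }
}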
@@ -12,7 +12,6 @@
 use anyhow::{bail, ensure, Context, Result};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use futures::{Stream, StreamExt};
-use regex::Regex;
 use std::io;
 use std::net::TcpListener;
 use std::str;
@@ -35,7 +34,6 @@ use crate::basebackup;
 use crate::config::{PageServerConf, ProfilingConfig};
 use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar};
 use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
-use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::profiling::profpoint_start;
 use crate::reltag::RelTag;
 use crate::task_mgr;
@@ -45,7 +43,6 @@ use crate::tenant_mgr;
 use crate::CheckpointConfig;

 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
-use postgres_ffi::to_pg_timestamp;
 use postgres_ffi::BLCKSZ;

 // Wrapped in libpq CopyData
@@ -1062,33 +1059,6 @@ impl postgres_backend_async::Handler for PageServerHandler {
 Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
 ]))?
 .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
-} else if query_string.starts_with("get_lsn_by_timestamp ") {
-// Locate LSN of last transaction with timestamp less or equal than sppecified
-// TODO lazy static
-let re = Regex::new(r"^get_lsn_by_timestamp ([[:xdigit:]]+) ([[:xdigit:]]+) '(.*)'$")
-.unwrap();
-let caps = re
-.captures(query_string)
-.with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?;
-let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?;
-let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?;
-let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?;
-let timestamp_pg = to_pg_timestamp(timestamp);
-
-self.check_permission(Some(tenant_id))?;
-
-let timeline = get_local_timeline(tenant_id, timeline_id)?;
-pgb.write_message(&BeMessage::RowDescription(&[RowDescriptor::text_col(
-b"lsn",
-)]))?;
-let result = match timeline.find_lsn_for_timestamp(timestamp_pg)? {
-LsnForTimestamp::Present(lsn) => format!("{}", lsn),
-LsnForTimestamp::Future(_lsn) => "future".into(),
-LsnForTimestamp::Past(_lsn) => "past".into(),
-LsnForTimestamp::NoData(_lsn) => "nodata".into(),
-};
-pgb.write_message(&BeMessage::DataRow(&[Some(result.as_bytes())]))?;
-pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
 } else {
 bail!("unknown command");
 }
@@ -169,9 +169,14 @@ use self::{
 upload::{upload_index_part, upload_timeline_layers, UploadedTimeline},
 };
 use crate::{
-config::PageServerConf, exponential_backoff, storage_sync::index::RemoteIndex, task_mgr,
-task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, tenant::metadata::TimelineMetadata,
-tenant_mgr::attach_local_tenants,
+config::PageServerConf,
+exponential_backoff,
+storage_sync::index::{LayerFileMetadata, RemoteIndex},
+task_mgr,
+task_mgr::TaskKind,
+task_mgr::BACKGROUND_RUNTIME,
+tenant::metadata::TimelineMetadata,
+tenant_mgr::{attach_local_tenants, TenantAttachData},
 };
 use crate::{
 metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD},
@@ -188,7 +193,7 @@ static SYNC_QUEUE: OnceCell<SyncQueue> = OnceCell::new();

 /// A timeline status to share with pageserver's sync counterpart,
 /// after comparing local and remote timeline state.
-#[derive(Clone)]
+#[derive(Clone, PartialEq, Eq)]
 pub enum LocalTimelineInitStatus {
 /// The timeline has every remote layer present locally.
 /// There could be some layers requiring uploading,
@@ -311,7 +316,7 @@ impl SyncQueue {

 /// A task to run in the async download/upload loop.
 /// Limited by the number of retries, after certain threshold the failing task gets evicted and the timeline disabled.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 enum SyncTask {
 /// A checkpoint outcome with possible local file updates that need actualization in the remote storage.
 /// Not necessary more fresh than the one already uploaded.
@@ -422,7 +427,7 @@ impl SyncTaskBatch {
 .extend(new_delete.data.deleted_layers.iter().cloned());
 }
 if let Some(batch_upload) = &mut self.upload {
-let not_deleted = |layer: &PathBuf| {
+let not_deleted = |layer: &PathBuf, _: &mut LayerFileMetadata| {
 !new_delete.data.layers_to_delete.contains(layer)
 && !new_delete.data.deleted_layers.contains(layer)
 };
@@ -450,21 +455,35 @@ impl SyncTaskBatch {
 #[derive(Debug, Clone, PartialEq, Eq)]
 struct LayersUpload {
 /// Layer file path in the pageserver workdir, that were added for the corresponding checkpoint.
-layers_to_upload: HashSet<PathBuf>,
+layers_to_upload: HashMap<PathBuf, LayerFileMetadata>,
 /// Already uploaded layers. Used to store the data about the uploads between task retries
 /// and to record the data into the remote index after the task got completed or evicted.
-uploaded_layers: HashSet<PathBuf>,
+uploaded_layers: HashMap<PathBuf, LayerFileMetadata>,
 metadata: Option<TimelineMetadata>,
 }

 /// A timeline download task.
 /// Does not contain the file list to download, to allow other
 /// parts of the pageserer code to schedule the task
-/// without using the remote index or any other ways to list the remote timleine files.
+/// without using the remote index or any other ways to list the remote timeline files.
 /// Skips the files that are already downloaded.
 #[derive(Debug, Clone, PartialEq, Eq)]
 struct LayersDownload {
 layers_to_skip: HashSet<PathBuf>,
+
+/// Paths which have been downloaded, and had their metadata verified or generated.
+///
+/// Metadata generation happens when upgrading from past version of `IndexPart`.
+gathered_metadata: HashMap<PathBuf, LayerFileMetadata>,
+}
+
+impl LayersDownload {
+fn from_skipped_layers(layers_to_skip: HashSet<PathBuf>) -> Self {
+LayersDownload {
+layers_to_skip,
+gathered_metadata: HashMap::default(),
+}
+}
 }

 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -486,7 +505,7 @@ struct LayersDeletion {
 pub fn schedule_layer_upload(
 tenant_id: TenantId,
 timeline_id: TimelineId,
-layers_to_upload: HashSet<PathBuf>,
+layers_to_upload: HashMap<PathBuf, LayerFileMetadata>,
 metadata: Option<TimelineMetadata>,
 ) {
 let sync_queue = match SYNC_QUEUE.get() {
@@ -503,7 +522,7 @@ pub fn schedule_layer_upload(
 },
 SyncTask::upload(LayersUpload {
 layers_to_upload,
-uploaded_layers: HashSet::new(),
+uploaded_layers: HashMap::new(),
 metadata,
 }),
 );
@@ -561,18 +580,44 @@ pub fn schedule_layer_download(tenant_id: TenantId, timeline_id: TimelineId) {
 tenant_id,
 timeline_id,
 },
-SyncTask::download(LayersDownload {
-layers_to_skip: HashSet::new(),
-}),
+SyncTask::download(LayersDownload::from_skipped_layers(HashSet::new())),
 );
 debug!("Download task for tenant {tenant_id}, timeline {timeline_id} sent")
 }

+/// Local existing timeline files
+///
+/// Values of this type serve different meanings in different contexts. On startup, collected
+/// timelines come with the full collected information and when signalling readyness to attach
+/// after completed download. After the download the file information is no longer carried, because
+/// it is already merged into [`RemoteTimeline`].
+#[derive(Debug)]
+pub struct TimelineLocalFiles(TimelineMetadata, HashMap<PathBuf, LayerFileMetadata>);
+
+impl TimelineLocalFiles {
+pub fn metadata(&self) -> &TimelineMetadata {
+&self.0
+}
+
+/// Called during startup, for all of the local files with full metadata.
+pub(crate) fn collected(
+metadata: TimelineMetadata,
+timeline_files: HashMap<PathBuf, LayerFileMetadata>,
+) -> TimelineLocalFiles {
+TimelineLocalFiles(metadata, timeline_files)
+}
+
+/// Called near the end of tenant initialization, to signal readyness to attach tenants.
+pub(crate) fn ready(metadata: TimelineMetadata) -> Self {
+TimelineLocalFiles(metadata, HashMap::new())
+}
+}
+
 /// Launch a thread to perform remote storage sync tasks.
 /// See module docs for loop step description.
 pub fn spawn_storage_sync_task(
 conf: &'static PageServerConf,
-local_timeline_files: TenantTimelineValues<(TimelineMetadata, HashSet<PathBuf>)>,
+local_timeline_files: HashMap<TenantId, HashMap<TimelineId, TimelineLocalFiles>>,
 storage: GenericRemoteStorage,
 max_concurrent_timelines_sync: NonZeroUsize,
 max_sync_errors: NonZeroU32,
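With layers_to_upload now a HashMap<PathBuf, LayerFileMetadata>, whoever schedules an upload has to pair every layer path with its on-disk size, which later ends up in the remote IndexPart (the test helper in the @@ -1427 hunk further down does exactly this with LayerFileMetadata::new). A hedged sketch of building such a map with the standard library only; LayerFileMetadata is replaced by a local stand-in here:

use std::collections::HashMap;
use std::path::PathBuf;

// Stand-in for storage_sync::index::LayerFileMetadata::new(file_size).
struct LayerFileMetadata(u64);

fn collect_layer_metadata(
    layer_paths: &[PathBuf],
) -> std::io::Result<HashMap<PathBuf, LayerFileMetadata>> {
    let mut layers = HashMap::with_capacity(layer_paths.len());
    for path in layer_paths {
        // The recorded size is what the sync loop later compares against the remote index.
        let size = std::fs::metadata(path)?.len();
        layers.insert(path.clone(), LayerFileMetadata(size));
    }
    Ok(layers)
}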
@@ -595,7 +640,7 @@ pub fn spawn_storage_sync_task(
 let mut keys_for_index_part_downloads = HashSet::new();
 let mut timelines_to_sync = HashMap::new();

-for (tenant_id, timeline_data) in local_timeline_files.0 {
+for (tenant_id, timeline_data) in local_timeline_files {
 if timeline_data.is_empty() {
 info!("got empty tenant {}", tenant_id);
 let _ = empty_tenants.0.entry(tenant_id).or_default();
@@ -698,7 +743,7 @@ async fn storage_sync_loop(
 "Sync loop step completed, {} new tenant state update(s)",
 updated_tenants.len()
 );
-let mut timelines_to_attach = TenantTimelineValues::new();
+let mut timelines_to_attach = HashMap::new();
 let index_accessor = index.read().await;
 for tenant_id in updated_tenants {
 let tenant_entry = match index_accessor.tenant_entry(&tenant_id) {
@@ -724,12 +769,16 @@ async fn storage_sync_loop(
 // and register them all at once in a tenant for download
 // to be submitted in a single operation to tenant
 // so it can apply them at once to internal timeline map.
-timelines_to_attach.0.insert(
+timelines_to_attach.insert(
 tenant_id,
-tenant_entry
-.iter()
-.map(|(&id, entry)| (id, entry.metadata.clone()))
-.collect(),
+TenantAttachData::Ready(
+tenant_entry
+.iter()
+.map(|(&id, entry)| {
+(id, TimelineLocalFiles::ready(entry.metadata.clone()))
+})
+.collect(),
+),
 );
 }
 }
@@ -971,15 +1020,27 @@ async fn download_timeline_data(
 }
 DownloadedTimeline::Successful(mut download_data) => {
 match update_local_metadata(conf, sync_id, current_remote_timeline).await {
-Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
-Ok(()) => {
-register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
-return DownloadStatus::Downloaded;
-}
-Err(e) => {
-error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}");
-}
-},
+Ok(()) => {
+let mut g = index.write().await;
+
+match g.set_awaits_download(&sync_id, false) {
+Ok(()) => {
+let timeline = g
+.timeline_entry_mut(&sync_id)
+.expect("set_awaits_download verified existence");
+
+timeline.merge_metadata_from_downloaded(
+&download_data.data.gathered_metadata,
+);
+
+register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
+return DownloadStatus::Downloaded;
+}
+Err(e) => {
+error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}");
+}
+};
+}
 Err(e) => {
 error!("Failed to update local timeline metadata: {e:?}");
 download_data.retries += 1;
@@ -1182,11 +1243,18 @@ async fn update_remote_data(
 }
 if upload_failed {
 existing_entry.add_upload_failures(
-uploaded_data.layers_to_upload.iter().cloned(),
+uploaded_data
+.layers_to_upload
+.iter()
+.map(|(k, v)| (k.to_owned(), v.to_owned())),
 );
 } else {
-existing_entry
-.add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned());
+existing_entry.add_timeline_layers(
+uploaded_data
+.uploaded_layers
+.iter()
+.map(|(k, v)| (k.to_owned(), v.to_owned())),
+);
 }
 }
 RemoteDataUpdate::Delete(layers_to_remove) => {
@@ -1206,11 +1274,19 @@ async fn update_remote_data(
 };
 let mut new_remote_timeline = RemoteTimeline::new(new_metadata.clone());
 if upload_failed {
-new_remote_timeline
-.add_upload_failures(uploaded_data.layers_to_upload.iter().cloned());
+new_remote_timeline.add_upload_failures(
+uploaded_data
+.layers_to_upload
+.iter()
+.map(|(k, v)| (k.to_owned(), v.to_owned())),
+);
 } else {
-new_remote_timeline
-.add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned());
+new_remote_timeline.add_timeline_layers(
+uploaded_data
+.uploaded_layers
+.iter()
+.map(|(k, v)| (k.to_owned(), v.to_owned())),
+);
 }

 index_accessor.add_timeline_entry(sync_id, new_remote_timeline.clone());
@@ -1258,13 +1334,14 @@ async fn validate_task_retries(
 fn schedule_first_sync_tasks(
 index: &mut RemoteTimelineIndex,
 sync_queue: &SyncQueue,
-local_timeline_files: HashMap<TenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
+local_timeline_files: HashMap<TenantTimelineId, TimelineLocalFiles>,
 ) -> TenantTimelineValues<LocalTimelineInitStatus> {
 let mut local_timeline_init_statuses = TenantTimelineValues::new();

 let mut new_sync_tasks = VecDeque::with_capacity(local_timeline_files.len());

-for (sync_id, (local_metadata, local_files)) in local_timeline_files {
+for (sync_id, local_timeline) in local_timeline_files {
+let TimelineLocalFiles(local_metadata, local_files) = local_timeline;
 match index.timeline_entry_mut(&sync_id) {
 Some(remote_timeline) => {
 let (timeline_status, awaits_download) = compare_local_and_remote_timeline(
@@ -1308,7 +1385,7 @@ fn schedule_first_sync_tasks(
 sync_id,
 SyncTask::upload(LayersUpload {
 layers_to_upload: local_files,
-uploaded_layers: HashSet::new(),
+uploaded_layers: HashMap::new(),
 metadata: Some(local_metadata.clone()),
 }),
 ));
@@ -1335,20 +1412,46 @@ fn compare_local_and_remote_timeline(
 new_sync_tasks: &mut VecDeque<(TenantTimelineId, SyncTask)>,
 sync_id: TenantTimelineId,
 local_metadata: TimelineMetadata,
-local_files: HashSet<PathBuf>,
+local_files: HashMap<PathBuf, LayerFileMetadata>,
 remote_entry: &RemoteTimeline,
 ) -> (LocalTimelineInitStatus, bool) {
 let _entered = info_span!("compare_local_and_remote_timeline", sync_id = %sync_id).entered();

-let remote_files = remote_entry.stored_files();
+let needed_to_download_files = remote_entry
+.stored_files()
+.iter()
+.filter_map(|(layer_file, remote_metadata)| {
+if let Some(local_metadata) = local_files.get(layer_file) {
+match (remote_metadata.file_size(), local_metadata.file_size()) {
+(Some(x), Some(y)) if x == y => { None },
+(None, Some(_)) => {
+// upgrading from an earlier IndexPart without metadata
+None
+},
+_ => {
+// having to deal with other than (Some(x), Some(y)) where x != y here is a
+// bummer, but see #2582 and #2610 for attempts and discussion.
+warn!("Redownloading locally existing {layer_file:?} due to size mismatch, size on index: {:?}, on disk: {:?}", remote_metadata.file_size(), local_metadata.file_size());
+Some(layer_file)
+},
+}
+} else {
+// doesn't exist locally
+Some(layer_file)
+}
+})
+.collect::<HashSet<_>>();

-let number_of_layers_to_download = remote_files.difference(&local_files).count();
-let (initial_timeline_status, awaits_download) = if number_of_layers_to_download > 0 {
+let (initial_timeline_status, awaits_download) = if !needed_to_download_files.is_empty() {
 new_sync_tasks.push_back((
 sync_id,
-SyncTask::download(LayersDownload {
-layers_to_skip: local_files.clone(),
-}),
+SyncTask::download(LayersDownload::from_skipped_layers(
+local_files
+.keys()
+.filter(|path| !needed_to_download_files.contains(path))
+.cloned()
+.collect(),
+)),
 ));
 info!("NeedsSync");
 (LocalTimelineInitStatus::NeedsSync, true)
@@ -1363,15 +1466,22 @@ fn compare_local_and_remote_timeline(
 };

 let layers_to_upload = local_files
-.difference(remote_files)
-.cloned()
-.collect::<HashSet<_>>();
+.iter()
+.filter_map(|(local_file, metadata)| {
+if !remote_entry.stored_files().contains_key(local_file) {
+Some((local_file.to_owned(), metadata.to_owned()))
+} else {
+None
+}
+})
+.collect::<HashMap<_, _>>();

 if !layers_to_upload.is_empty() {
 new_sync_tasks.push_back((
 sync_id,
 SyncTask::upload(LayersUpload {
 layers_to_upload,
-uploaded_layers: HashSet::new(),
+uploaded_layers: HashMap::new(),
 metadata: Some(local_metadata),
 }),
 ));
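The rewritten compare_local_and_remote_timeline above decides per layer whether a download is needed: a layer missing locally is always fetched, matching sizes mean the local copy is kept, a remote entry without a size is treated as an upgrade from an older IndexPart and also kept, and anything else is re-downloaded with a warning. The decision table can be written as a small standalone function; this is a sketch of the rule, not the code above:

/// Should this remote layer be (re)downloaded?
/// `remote_size` is the size recorded in the remote index (None for older IndexPart versions);
/// `local` is None when there is no local copy, otherwise the locally recorded size.
fn needs_download(remote_size: Option<u64>, local: Option<Option<u64>>) -> bool {
    match local {
        // Not present locally: must be downloaded.
        None => true,
        Some(local_size) => match (remote_size, local_size) {
            // Sizes known and equal: keep the local file.
            (Some(r), Some(l)) if r == l => false,
            // Remote index predates per-layer metadata: keep the local file.
            (None, Some(_)) => false,
            // Size mismatch or missing local size: re-download.
            _ => true,
        },
    }
}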
@@ -1427,11 +1537,12 @@ mod test_utils {
 let timeline_path = harness.timeline_path(&timeline_id);
 fs::create_dir_all(&timeline_path).await?;

-let mut layers_to_upload = HashSet::with_capacity(filenames.len());
+let mut layers_to_upload = HashMap::with_capacity(filenames.len());
 for &file in filenames {
 let file_path = timeline_path.join(file);
 fs::write(&file_path, dummy_contents(file).into_bytes()).await?;
-layers_to_upload.insert(file_path);
+let metadata = LayerFileMetadata::new(file_path.metadata()?.len());
+layers_to_upload.insert(file_path, metadata);
 }

 fs::write(
@@ -1442,7 +1553,7 @@ mod test_utils {

 Ok(LayersUpload {
 layers_to_upload,
-uploaded_layers: HashSet::new(),
+uploaded_layers: HashMap::new(),
 metadata: Some(metadata),
 })
 }
@@ -1497,12 +1608,13 @@ mod tests {
 assert!(sync_id_2 != sync_id_3);
 assert!(sync_id_3 != TEST_SYNC_ID);

-let download_task = SyncTask::download(LayersDownload {
-layers_to_skip: HashSet::from([PathBuf::from("sk")]),
-});
+let download_task =
+SyncTask::download(LayersDownload::from_skipped_layers(HashSet::from([
+PathBuf::from("sk"),
+])));
 let upload_task = SyncTask::upload(LayersUpload {
-layers_to_upload: HashSet::from([PathBuf::from("up")]),
-uploaded_layers: HashSet::from([PathBuf::from("upl")]),
+layers_to_upload: HashMap::from([(PathBuf::from("up"), LayerFileMetadata::new(123))]),
+uploaded_layers: HashMap::from([(PathBuf::from("upl"), LayerFileMetadata::new(123))]),
 metadata: Some(dummy_metadata(Lsn(2))),
 });
 let delete_task = SyncTask::delete(LayersDeletion {
@@ -1546,12 +1658,10 @@ mod tests {
 let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
 assert_eq!(sync_queue.len(), 0);

-let download = LayersDownload {
-layers_to_skip: HashSet::from([PathBuf::from("sk")]),
-};
+let download = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk")]));
 let upload = LayersUpload {
-layers_to_upload: HashSet::from([PathBuf::from("up")]),
-uploaded_layers: HashSet::from([PathBuf::from("upl")]),
+layers_to_upload: HashMap::from([(PathBuf::from("up"), LayerFileMetadata::new(123))]),
+uploaded_layers: HashMap::from([(PathBuf::from("upl"), LayerFileMetadata::new(123))]),
 metadata: Some(dummy_metadata(Lsn(2))),
 };
 let delete = LayersDeletion {
@@ -1599,18 +1709,10 @@ mod tests {
 #[tokio::test]
 async fn same_task_id_same_tasks_batch() {
 let sync_queue = SyncQueue::new(NonZeroUsize::new(1).unwrap());
-let download_1 = LayersDownload {
-layers_to_skip: HashSet::from([PathBuf::from("sk1")]),
-};
-let download_2 = LayersDownload {
-layers_to_skip: HashSet::from([PathBuf::from("sk2")]),
-};
-let download_3 = LayersDownload {
-layers_to_skip: HashSet::from([PathBuf::from("sk3")]),
-};
-let download_4 = LayersDownload {
-layers_to_skip: HashSet::from([PathBuf::from("sk4")]),
-};
+let download_1 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk1")]));
+let download_2 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk2")]));
+let download_3 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk3")]));
+let download_4 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk4")]));

 let sync_id_2 = TenantTimelineId {
 tenant_id: TenantId::from_array(hex!("22223344556677881122334455667788")),
@@ -1634,15 +1736,15 @@ mod tests {
 Some(SyncTaskBatch {
 download: Some(SyncData {
 retries: 0,
-data: LayersDownload {
-layers_to_skip: {
+data: LayersDownload::from_skipped_layers(
+{
 let mut set = HashSet::new();
 set.extend(download_1.layers_to_skip.into_iter());
 set.extend(download_2.layers_to_skip.into_iter());
 set.extend(download_4.layers_to_skip.into_iter());
 set
 },
-}
+)
 }),
 upload: None,
 delete: None,
@@ -1658,4 +1760,148 @@ mod tests {
 "Should have one task left out of the batch"
 );
 }
+
+mod local_and_remote_comparisons {
+use super::*;
+
+#[test]
+fn ready() {
+let mut new_sync_tasks = VecDeque::default();
+let sync_id = TenantTimelineId::generate();
+let local_metadata = dummy_metadata(0x02.into());
+let local_files =
+HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
+let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
+remote_entry
+.add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
+
+let (status, sync_needed) = compare_local_and_remote_timeline(
+&mut new_sync_tasks,
+sync_id,
+local_metadata.clone(),
+local_files,
+&remote_entry,
+);
+
+assert_eq!(
+status,
+LocalTimelineInitStatus::LocallyComplete(local_metadata)
+);
+assert!(!sync_needed);
+
+assert!(new_sync_tasks.is_empty(), "{:?}", new_sync_tasks);
+}
+
+#[test]
+fn needs_download() {
+let mut new_sync_tasks = VecDeque::default();
+let sync_id = TenantTimelineId::generate();
+let local_metadata = dummy_metadata(0x02.into());
+let local_files = HashMap::default();
+let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
+remote_entry
+.add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
+
+let (status, sync_needed) = compare_local_and_remote_timeline(
+&mut new_sync_tasks,
+sync_id,
+local_metadata,
+local_files.clone(),
+&remote_entry,
+);
+
+assert_eq!(status, LocalTimelineInitStatus::NeedsSync);
+assert!(sync_needed);
+
+let new_sync_tasks = new_sync_tasks.into_iter().collect::<Vec<_>>();
+
+assert_eq!(
+&new_sync_tasks,
+&[(
+sync_id,
+SyncTask::download(LayersDownload::from_skipped_layers(
+local_files.keys().cloned().collect()
+))
+)]
+);
+}
+
+#[test]
+fn redownload_is_not_needed_on_upgrade() {
+// originally the implementation missed the `(None, Some(_))` case in the match, and
+// proceeded to always redownload if the remote metadata was not available.
+
+let mut new_sync_tasks = VecDeque::default();
+let sync_id = TenantTimelineId::generate();
+
+let local_metadata = dummy_metadata(0x02.into());
+
+// type system would in general allow that LayerFileMetadata would be created with
+// file_size: None, however `LayerFileMetadata::default` is only allowed from tests,
+// and so everywhere within the system valid LayerFileMetadata is being created, it is
+// created through `::new`.
+let local_files =
+HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
+
+let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
+
+// RemoteTimeline is constructed out of an older version IndexPart, which didn't carry
+// any metadata.
+remote_entry
+.add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::default())]);
+
+let (status, sync_needed) = compare_local_and_remote_timeline(
+&mut new_sync_tasks,
+sync_id,
+local_metadata.clone(),
+local_files,
+&remote_entry,
+);
+
+assert_eq!(
+status,
+LocalTimelineInitStatus::LocallyComplete(local_metadata)
+);
+assert!(!sync_needed);
+}
+
+#[test]
+fn needs_upload() {
+let mut new_sync_tasks = VecDeque::default();
+let sync_id = TenantTimelineId::generate();
+let local_metadata = dummy_metadata(0x02.into());
+let local_files =
+HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
+let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
+remote_entry.add_timeline_layers([]);
+
+let (status, sync_needed) = compare_local_and_remote_timeline(
+&mut new_sync_tasks,
+sync_id,
+local_metadata.clone(),
+local_files.clone(),
+&remote_entry,
+);
+
+assert_eq!(
+status,
+LocalTimelineInitStatus::LocallyComplete(local_metadata.clone())
+);
+assert!(!sync_needed);
+
+let new_sync_tasks = new_sync_tasks.into_iter().collect::<Vec<_>>();
+
+assert_eq!(
+&new_sync_tasks,
+&[(
+sync_id,
+SyncTask::upload(LayersUpload {
+layers_to_upload: local_files,
+uploaded_layers: HashMap::default(),
+metadata: Some(local_metadata),
+})
+)]
+);
+}
+}
 }
@@ -171,7 +171,7 @@ mod tests {
  let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
  let timeline_upload =
  create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
- for local_path in timeline_upload.layers_to_upload {
+ for (local_path, _metadata) in timeline_upload.layers_to_upload {
  let remote_path =
  local_storage.resolve_in_storage(&local_storage.remote_object_id(&local_path)?)?;
  let remote_parent_dir = remote_path.parent().unwrap();
@@ -16,7 +16,11 @@ use tokio::{
  };
  use tracing::{debug, error, info, warn};

- use crate::{config::PageServerConf, storage_sync::SyncTask, TEMP_FILE_SUFFIX};
+ use crate::{
+ config::PageServerConf,
+ storage_sync::{index::LayerFileMetadata, SyncTask},
+ TEMP_FILE_SUFFIX,
+ };
  use utils::{
  crashsafe_dir::path_with_suffix_extension,
  id::{TenantId, TenantTimelineId, TimelineId},
@@ -219,8 +223,14 @@ pub(super) async fn download_timeline_layers<'a>(

  let layers_to_download = remote_timeline
  .stored_files()
- .difference(&download.layers_to_skip)
- .cloned()
+ .iter()
+ .filter_map(|(layer_path, metadata)| {
+ if !download.layers_to_skip.contains(layer_path) {
+ Some((layer_path.to_owned(), metadata.to_owned()))
+ } else {
+ None
+ }
+ })
  .collect::<Vec<_>>();

  debug!("Layers to download: {layers_to_download:?}");
@@ -233,89 +243,129 @@ pub(super) async fn download_timeline_layers<'a>(

  let mut download_tasks = layers_to_download
  .into_iter()
- .map(|layer_destination_path| async move {
- if layer_destination_path.exists() {
- debug!(
- "Layer already exists locally, skipping download: {}",
- layer_destination_path.display()
- );
- } else {
- // Perform a rename inspired by durable_rename from file_utils.c.
- // The sequence:
- // write(tmp)
- // fsync(tmp)
- // rename(tmp, new)
- // fsync(new)
- // fsync(parent)
- // For more context about durable_rename check this email from postgres mailing list:
- // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
- // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
- let temp_file_path =
- path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX);

- let mut destination_file =
- fs::File::create(&temp_file_path).await.with_context(|| {
- format!(
- "Failed to create a destination file for layer '{}'",
- temp_file_path.display()
- )
- })?;
- let mut layer_download = storage.download_storage_object(None, &layer_destination_path)
- .await
- .with_context(|| {
- format!(
- "Failed to initiate the download the layer for {sync_id} into file '{}'",
- temp_file_path.display()
- )
- })?;
- io::copy(&mut layer_download.download_stream, &mut destination_file)
- .await
- .with_context(|| {
- format!(
- "Failed to download the layer for {sync_id} into file '{}'",
- temp_file_path.display()
- )
- })?;
- // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
- // A file will not be closed immediately when it goes out of scope if there are any IO operations
- // that have not yet completed. To ensure that a file is closed immediately when it is dropped,
- // you should call flush before dropping it.
- //
- // From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
- // we assume that `destination_file` file is fully written. I.e there is no pending .write(...).await operations.
- // But for additional safety let's check/wait for any pending operations.
- destination_file.flush().await.with_context(|| {
- format!(
- "failed to flush source file at {}",
- temp_file_path.display()
- )
- })?;

- // not using sync_data because it can lose file size update
- destination_file.sync_all().await.with_context(|| {
- format!(
- "failed to fsync source file at {}",
- temp_file_path.display()
- )
- })?;
- drop(destination_file);

- fail::fail_point!("remote-storage-download-pre-rename", |_| {
- anyhow::bail!("remote-storage-download-pre-rename failpoint triggered")
- });

- fs::rename(&temp_file_path, &layer_destination_path).await?;

- fsync_path(&layer_destination_path).await.with_context(|| {
- format!(
- "Cannot fsync layer destination path {}",
- layer_destination_path.display(),
- )
- })?;
- }
- Ok::<_, anyhow::Error>(layer_destination_path)
+ .map(|(layer_destination_path, metadata)| async move {
+ match layer_destination_path.metadata() {
+ Ok(m) if m.is_file() => {
+ // the file exists from earlier round when we failed after renaming it as
+ // layer_destination_path
+ let verified = if let Some(expected) = metadata.file_size() {
+ m.len() == expected
+ } else {
+ // behaviour before recording metadata was to accept any existing
+ true
+ };

+ if verified {
+ debug!(
+ "Layer already exists locally, skipping download: {}",
+ layer_destination_path.display()
+ );
+ return Ok((layer_destination_path, LayerFileMetadata::new(m.len())))
+ } else {
+ // no need to remove it, it will be overwritten by fs::rename
+ // after successful download
+ warn!("Downloaded layer exists already but layer file metadata mismatches: {}, metadata {:?}", layer_destination_path.display(), metadata);
+ }
+ }
+ Ok(m) => {
+ return Err(anyhow::anyhow!("Downloaded layer destination exists but is not a file: {m:?}, target needs to be removed/archived manually: {layer_destination_path:?}"));
+ }
+ Err(_) => {
+ // behave as the file didn't exist
+ }
+ }

+ // Perform a rename inspired by durable_rename from file_utils.c.
+ // The sequence:
+ // write(tmp)
+ // fsync(tmp)
+ // rename(tmp, new)
+ // fsync(new)
+ // fsync(parent)
+ // For more context about durable_rename check this email from postgres mailing list:
+ // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
+ // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
+ let temp_file_path =
+ path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX);

+ // TODO: this doesn't use the cached fd for some reason?
+ let mut destination_file =
+ fs::File::create(&temp_file_path).await.with_context(|| {
+ format!(
+ "Failed to create a destination file for layer '{}'",
+ temp_file_path.display()
+ )
+ })?;

+ let mut layer_download = storage.download_storage_object(None, &layer_destination_path)
+ .await
+ .with_context(|| {
+ format!(
+ "Failed to initiate the download the layer for {sync_id} into file '{}'",
+ temp_file_path.display()
+ )
+ })?;

+ let bytes_amount = io::copy(&mut layer_download.download_stream, &mut destination_file)
+ .await
+ .with_context(|| {
+ format!(
+ "Failed to download the layer for {sync_id} into file '{}'",
+ temp_file_path.display()
+ )
+ })?;

+ // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
+ // A file will not be closed immediately when it goes out of scope if there are any IO operations
+ // that have not yet completed. To ensure that a file is closed immediately when it is dropped,
+ // you should call flush before dropping it.
+ //
+ // From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
+ // we assume that `destination_file` file is fully written. I.e there is no pending .write(...).await operations.
+ // But for additional safety let's check/wait for any pending operations.
+ destination_file.flush().await.with_context(|| {
+ format!(
+ "failed to flush source file at {}",
+ temp_file_path.display()
+ )
+ })?;

+ match metadata.file_size() {
+ Some(expected) if expected != bytes_amount => {
+ anyhow::bail!(
+ "According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
+ temp_file_path.display()
+ );
+ },
+ Some(_) | None => {
+ // matches, or upgrading from an earlier IndexPart version
+ }
+ }

+ // not using sync_data because it can lose file size update
+ destination_file.sync_all().await.with_context(|| {
+ format!(
+ "failed to fsync source file at {}",
+ temp_file_path.display()
+ )
+ })?;
+ drop(destination_file);

+ fail::fail_point!("remote-storage-download-pre-rename", |_| {
+ anyhow::bail!("remote-storage-download-pre-rename failpoint triggered")
+ });

+ fs::rename(&temp_file_path, &layer_destination_path).await?;

+ fsync_path(&layer_destination_path).await.with_context(|| {
+ format!(
+ "Cannot fsync layer destination path {}",
+ layer_destination_path.display(),
+ )
+ })?;

+ Ok::<_, anyhow::Error>((layer_destination_path, LayerFileMetadata::new(bytes_amount)))
  })
  .collect::<FuturesUnordered<_>>();

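The rewritten download task above verifies any pre-existing file against the recorded LayerFileMetadata size and otherwise falls back to the write → fsync → rename → fsync(parent) ordering described in its comments. As a rough, standalone illustration of that durable-rename ordering only (a simplified synchronous sketch using std::fs; the temp-file naming and error handling here are invented for the example and are not the pageserver's code, and the directory fsync is Unix-specific):

    use std::fs::{self, File};
    use std::io::Write;
    use std::path::Path;

    /// Write `data` to `target` durably: write a temp file, fsync it, rename it
    /// over the target, then fsync the target and its parent directory.
    fn durable_write(target: &Path, data: &[u8]) -> std::io::Result<()> {
        let tmp = target.with_extension("temp");

        // write(tmp) + fsync(tmp)
        let mut f = File::create(&tmp)?;
        f.write_all(data)?;
        f.sync_all()?;
        drop(f);

        // rename(tmp, target) + fsync(target)
        fs::rename(&tmp, target)?;
        File::open(target)?.sync_all()?;

        // fsync(parent) so the rename itself survives a crash
        if let Some(parent) = target.parent() {
            File::open(parent)?.sync_all()?;
        }
        Ok(())
    }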
@@ -324,9 +374,12 @@ pub(super) async fn download_timeline_layers<'a>(
  let mut undo = HashSet::new();
  while let Some(download_result) = download_tasks.next().await {
  match download_result {
- Ok(downloaded_path) => {
+ Ok((downloaded_path, metadata)) => {
  undo.insert(downloaded_path.clone());
- download.layers_to_skip.insert(downloaded_path);
+ download.layers_to_skip.insert(downloaded_path.clone());
+ // what if the key existed already? ignore, because then we would had
+ // downloaded a partial file, and had to retry
+ download.gathered_metadata.insert(downloaded_path, metadata);
  }
  Err(e) => {
  errors_happened = true;
@@ -349,6 +402,8 @@ pub(super) async fn download_timeline_layers<'a>(
  );
  for item in undo {
  download.layers_to_skip.remove(&item);
+ // intentionally don't clear the gathered_metadata because it exists for fsync_path
+ // failure on parent directory
  }
  errors_happened = true;
  }
@@ -453,9 +508,9 @@ mod tests {
  let timeline_upload =
  create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;

- for local_path in timeline_upload.layers_to_upload {
+ for local_path in timeline_upload.layers_to_upload.keys() {
  let remote_path =
- local_storage.resolve_in_storage(&storage.remote_object_id(&local_path)?)?;
+ local_storage.resolve_in_storage(&storage.remote_object_id(local_path)?)?;
  let remote_parent_dir = remote_path.parent().unwrap();
  if !remote_parent_dir.exists() {
  fs::create_dir_all(&remote_parent_dir).await?;
@@ -473,11 +528,19 @@ mod tests {

  let mut remote_timeline = RemoteTimeline::new(metadata.clone());
  remote_timeline.awaits_download = true;
- remote_timeline.add_timeline_layers(
- layer_files
- .iter()
- .map(|layer| local_timeline_path.join(layer)),
- );
+ remote_timeline.add_timeline_layers(layer_files.iter().map(|layer| {
+ let layer_path = local_timeline_path.join(layer);
+
+ // this could had also been LayerFileMetadata::default(), but since in this test we
+ // don't do the merge operation done by storage_sync::download_timeline_data, it would
+ // not be merged back to timeline.
+ let metadata_from_upload = timeline_upload
+ .layers_to_upload
+ .get(&layer_path)
+ .expect("layer must exist in previously uploaded paths")
+ .to_owned();
+ (layer_path, metadata_from_upload)
+ }));

  let download_data = match download_timeline_layers(
  harness.conf,
@@ -487,9 +550,9 @@ mod tests {
  sync_id,
  SyncData::new(
  current_retries,
- LayersDownload {
- layers_to_skip: HashSet::from([local_timeline_path.join("layer_to_skip")]),
- },
+ LayersDownload::from_skipped_layers(HashSet::from([
+ local_timeline_path.join("layer_to_skip")
+ ])),
  ),
  )
  .await
@@ -552,12 +615,7 @@ mod tests {
  &sync_queue,
  None,
  sync_id,
- SyncData::new(
- 0,
- LayersDownload {
- layers_to_skip: HashSet::new(),
- },
- ),
+ SyncData::new(0, LayersDownload::from_skipped_layers(HashSet::new())),
  )
  .await;
  assert!(
@@ -576,12 +634,7 @@ mod tests {
  &sync_queue,
  Some(&not_expecting_download_remote_timeline),
  sync_id,
- SyncData::new(
- 0,
- LayersDownload {
- layers_to_skip: HashSet::new(),
- },
- ),
+ SyncData::new(0, LayersDownload::from_skipped_layers(HashSet::new())),
  )
  .await;
  assert!(
@@ -212,8 +212,8 @@ impl RemoteTimelineIndex {
  /// Restored index part data about the timeline, stored in the remote index.
  #[derive(Debug, Clone)]
  pub struct RemoteTimeline {
- timeline_layers: HashSet<PathBuf>,
- missing_layers: HashSet<PathBuf>,
+ timeline_layers: HashMap<PathBuf, LayerFileMetadata>,
+ missing_layers: HashMap<PathBuf, LayerFileMetadata>,

  pub metadata: TimelineMetadata,
  pub awaits_download: bool,
@@ -222,62 +222,161 @@ pub struct RemoteTimeline {
  impl RemoteTimeline {
  pub fn new(metadata: TimelineMetadata) -> Self {
  Self {
- timeline_layers: HashSet::new(),
- missing_layers: HashSet::new(),
+ timeline_layers: HashMap::default(),
+ missing_layers: HashMap::default(),
  metadata,
  awaits_download: false,
  }
  }

- pub fn add_timeline_layers(&mut self, new_layers: impl IntoIterator<Item = PathBuf>) {
- self.timeline_layers.extend(new_layers.into_iter());
+ pub fn add_timeline_layers(
+ &mut self,
+ new_layers: impl IntoIterator<Item = (PathBuf, LayerFileMetadata)>,
+ ) {
+ self.timeline_layers.extend(new_layers);
  }

- pub fn add_upload_failures(&mut self, upload_failures: impl IntoIterator<Item = PathBuf>) {
- self.missing_layers.extend(upload_failures.into_iter());
+ pub fn add_upload_failures(
+ &mut self,
+ upload_failures: impl IntoIterator<Item = (PathBuf, LayerFileMetadata)>,
+ ) {
+ self.missing_layers.extend(upload_failures);
  }

  pub fn remove_layers(&mut self, layers_to_remove: &HashSet<PathBuf>) {
  self.timeline_layers
- .retain(|layer| !layers_to_remove.contains(layer));
+ .retain(|layer, _| !layers_to_remove.contains(layer));
  self.missing_layers
- .retain(|layer| !layers_to_remove.contains(layer));
+ .retain(|layer, _| !layers_to_remove.contains(layer));
  }

  /// Lists all layer files in the given remote timeline. Omits the metadata file.
- pub fn stored_files(&self) -> &HashSet<PathBuf> {
+ pub fn stored_files(&self) -> &HashMap<PathBuf, LayerFileMetadata> {
  &self.timeline_layers
  }

+ /// Combines metadata gathered or verified during downloading needed layer files to metadata on
+ /// the [`RemoteIndex`], so it can be uploaded later.
+ pub fn merge_metadata_from_downloaded(
+ &mut self,
+ downloaded: &HashMap<PathBuf, LayerFileMetadata>,
+ ) {
+ downloaded.iter().for_each(|(path, metadata)| {
+ if let Some(upgraded) = self.timeline_layers.get_mut(path) {
+ upgraded.merge(metadata);
+ }
+ });
+ }

  pub fn from_index_part(timeline_path: &Path, index_part: IndexPart) -> anyhow::Result<Self> {
  let metadata = TimelineMetadata::from_bytes(&index_part.metadata_bytes)?;
+ let default_metadata = &IndexLayerMetadata::default();

+ let find_metadata = |key: &RelativePath| -> LayerFileMetadata {
+ index_part
+ .layer_metadata
+ .get(key)
+ .unwrap_or(default_metadata)
+ .into()
+ };

  Ok(Self {
- timeline_layers: to_local_paths(timeline_path, index_part.timeline_layers),
- missing_layers: to_local_paths(timeline_path, index_part.missing_layers),
+ timeline_layers: index_part
+ .timeline_layers
+ .iter()
+ .map(|layer_path| (layer_path.as_path(timeline_path), find_metadata(layer_path)))
+ .collect(),
+ missing_layers: index_part
+ .missing_layers
+ .iter()
+ .map(|layer_path| (layer_path.as_path(timeline_path), find_metadata(layer_path)))
+ .collect(),
  metadata,
  awaits_download: false,
  })
  }
  }

+ /// Metadata gathered for each of the layer files.
+ ///
+ /// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
+ /// might have less or more metadata depending if upgrading or rolling back an upgrade.
+ #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+ #[cfg_attr(test, derive(Default))]
+ pub struct LayerFileMetadata {
+ file_size: Option<u64>,
+ }

+ impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
+ fn from(other: &IndexLayerMetadata) -> Self {
+ LayerFileMetadata {
+ file_size: other.file_size,
+ }
+ }
+ }

+ impl LayerFileMetadata {
+ pub fn new(file_size: u64) -> Self {
+ LayerFileMetadata {
+ file_size: Some(file_size),
+ }
+ }

+ pub fn file_size(&self) -> Option<u64> {
+ self.file_size
+ }

+ /// Metadata has holes due to version upgrades. This method is called to upgrade self with the
+ /// other value.
+ ///
+ /// This is called on the possibly outdated version.
+ pub fn merge(&mut self, other: &Self) {
+ self.file_size = other.file_size.or(self.file_size);
+ }
+ }

  /// Part of the remote index, corresponding to a certain timeline.
  /// Contains the data about all files in the timeline, present remotely and its metadata.
+ ///
+ /// This type needs to be backwards and forwards compatible. When changing the fields,
+ /// remember to add a test case for the changed version.
  #[serde_as]
  #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
  pub struct IndexPart {
+ /// Debugging aid describing the version of this type.
+ #[serde(default)]
+ version: usize,

+ /// Each of the layers present on remote storage.
+ ///
+ /// Additional metadata can might exist in `layer_metadata`.
  timeline_layers: HashSet<RelativePath>,

  /// Currently is not really used in pageserver,
  /// present to manually keep track of the layer files that pageserver might never retrieve.
  ///
  /// Such "holes" might appear if any upload task was evicted on an error threshold:
  /// the this layer will only be rescheduled for upload on pageserver restart.
  missing_layers: HashSet<RelativePath>,

+ /// Per layer file metadata, which can be present for a present or missing layer file.
+ ///
+ /// Older versions of `IndexPart` will not have this property or have only a part of metadata
+ /// that latest version stores.
+ #[serde(default)]
+ layer_metadata: HashMap<RelativePath, IndexLayerMetadata>,

  #[serde_as(as = "DisplayFromStr")]
  disk_consistent_lsn: Lsn,
  metadata_bytes: Vec<u8>,
  }

  impl IndexPart {
+ /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
+ /// used to understand later versions.
+ ///
+ /// Version is currently informative only.
+ const LATEST_VERSION: usize = 1;
  pub const FILE_NAME: &'static str = "index_part.json";

  #[cfg(test)]
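The merge rule introduced above is simply "a value recovered more recently fills a hole, but a hole never erases a value". A minimal stand-alone sketch of that Option-based rule (the struct and variable names here are illustrative, not taken from the codebase):

    /// Minimal stand-in for LayerFileMetadata's merge behaviour.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    struct Meta {
        file_size: Option<u64>,
    }

    impl Meta {
        fn merge(&mut self, other: &Meta) {
            // the newer value wins; otherwise keep whatever was already known
            self.file_size = other.file_size.or(self.file_size);
        }
    }

    fn main() {
        let mut from_old_index = Meta { file_size: None };
        let gathered_on_download = Meta { file_size: Some(123) };
        from_old_index.merge(&gathered_on_download);
        assert_eq!(from_old_index.file_size, Some(123));

        // Merging a hole does not erase a value that is already present.
        from_old_index.merge(&Meta { file_size: None });
        assert_eq!(from_old_index.file_size, Some(123));
    }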
@@ -288,8 +387,10 @@ impl IndexPart {
  metadata_bytes: Vec<u8>,
  ) -> Self {
  Self {
+ version: Self::LATEST_VERSION,
  timeline_layers,
  missing_layers,
+ layer_metadata: HashMap::default(),
  disk_consistent_lsn,
  metadata_bytes,
  }
@@ -304,35 +405,68 @@ impl IndexPart {
  remote_timeline: RemoteTimeline,
  ) -> anyhow::Result<Self> {
  let metadata_bytes = remote_timeline.metadata.to_bytes()?;

+ let mut layer_metadata = HashMap::new();

+ let mut missing_layers = HashSet::new();

+ separate_paths_and_metadata(
+ timeline_path,
+ &remote_timeline.missing_layers,
+ &mut missing_layers,
+ &mut layer_metadata,
+ )
+ .context("Failed to convert missing layers' paths to relative ones")?;

+ let mut timeline_layers = HashSet::new();

+ separate_paths_and_metadata(
+ timeline_path,
+ &remote_timeline.timeline_layers,
+ &mut timeline_layers,
+ &mut layer_metadata,
+ )
+ .context("Failed to convert timeline layers' paths to relative ones")?;

  Ok(Self {
- timeline_layers: to_relative_paths(timeline_path, remote_timeline.timeline_layers)
- .context("Failed to convert timeline layers' paths to relative ones")?,
- missing_layers: to_relative_paths(timeline_path, remote_timeline.missing_layers)
- .context("Failed to convert missing layers' paths to relative ones")?,
+ version: Self::LATEST_VERSION,
+ timeline_layers,
+ missing_layers,
+ layer_metadata,
  disk_consistent_lsn: remote_timeline.metadata.disk_consistent_lsn(),
  metadata_bytes,
  })
  }
  }

- fn to_local_paths(
- timeline_path: &Path,
- paths: impl IntoIterator<Item = RelativePath>,
- ) -> HashSet<PathBuf> {
- paths
- .into_iter()
- .map(|path| path.as_path(timeline_path))
- .collect()
+ /// Serialized form of [`LayerFileMetadata`].
+ #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
+ pub struct IndexLayerMetadata {
+ file_size: Option<u64>,
  }

- fn to_relative_paths(
- timeline_path: &Path,
- paths: impl IntoIterator<Item = PathBuf>,
- ) -> anyhow::Result<HashSet<RelativePath>> {
- paths
- .into_iter()
- .map(|path| RelativePath::new(timeline_path, path))
- .collect()
+ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
+ fn from(other: &'_ LayerFileMetadata) -> Self {
+ IndexLayerMetadata {
+ file_size: other.file_size,
+ }
+ }
+ }

+ fn separate_paths_and_metadata(
+ timeline_path: &Path,
+ input: &HashMap<PathBuf, LayerFileMetadata>,
+ output: &mut HashSet<RelativePath>,
+ layer_metadata: &mut HashMap<RelativePath, IndexLayerMetadata>,
+ ) -> anyhow::Result<()> {
+ for (path, metadata) in input {
+ let rel_path = RelativePath::new(timeline_path, path)?;
+ let metadata = IndexLayerMetadata::from(metadata);

+ layer_metadata.insert(rel_path.clone(), metadata);
+ output.insert(rel_path);
+ }
+ Ok(())
  }

  #[cfg(test)]
@@ -357,13 +491,13 @@ mod tests {
  DEFAULT_PG_VERSION,
  );
  let remote_timeline = RemoteTimeline {
- timeline_layers: HashSet::from([
- timeline_path.join("layer_1"),
- timeline_path.join("layer_2"),
+ timeline_layers: HashMap::from([
+ (timeline_path.join("layer_1"), LayerFileMetadata::new(1)),
+ (timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
  ]),
- missing_layers: HashSet::from([
- timeline_path.join("missing_1"),
- timeline_path.join("missing_2"),
+ missing_layers: HashMap::from([
+ (timeline_path.join("missing_1"), LayerFileMetadata::new(3)),
+ (timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
  ]),
  metadata: metadata.clone(),
  awaits_download: false,
@@ -485,13 +619,13 @@ mod tests {
  let conversion_result = IndexPart::from_remote_timeline(
  &timeline_path,
  RemoteTimeline {
- timeline_layers: HashSet::from([
- PathBuf::from("bad_path"),
- timeline_path.join("layer_2"),
+ timeline_layers: HashMap::from([
+ (PathBuf::from("bad_path"), LayerFileMetadata::new(1)),
+ (timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
  ]),
- missing_layers: HashSet::from([
- timeline_path.join("missing_1"),
- timeline_path.join("missing_2"),
+ missing_layers: HashMap::from([
+ (timeline_path.join("missing_1"), LayerFileMetadata::new(3)),
+ (timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
  ]),
  metadata: metadata.clone(),
  awaits_download: false,
@@ -502,13 +636,13 @@ mod tests {
  let conversion_result = IndexPart::from_remote_timeline(
  &timeline_path,
  RemoteTimeline {
- timeline_layers: HashSet::from([
- timeline_path.join("layer_1"),
- timeline_path.join("layer_2"),
+ timeline_layers: HashMap::from([
+ (timeline_path.join("layer_1"), LayerFileMetadata::new(1)),
+ (timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
  ]),
- missing_layers: HashSet::from([
- PathBuf::from("bad_path"),
- timeline_path.join("missing_2"),
+ missing_layers: HashMap::from([
+ (PathBuf::from("bad_path"), LayerFileMetadata::new(3)),
+ (timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
  ]),
  metadata,
  awaits_download: false,
@@ -516,4 +650,63 @@ mod tests {
  );
  assert!(conversion_result.is_err(), "Should not be able to convert metadata with missing layer paths that are not in the timeline directory");
  }

+ #[test]
+ fn v0_indexpart_is_parsed() {
+ let example = r#"{
+ "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
+ "missing_layers":["not_a_real_layer_but_adding_coverage"],
+ "disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+ }"#;

+ let expected = IndexPart {
+ version: 0,
+ timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
+ missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
+ layer_metadata: HashMap::default(),
+ disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+ };

+ let part = serde_json::from_str::<IndexPart>(example).unwrap();
+ assert_eq!(part, expected);
+ }

+ #[test]
+ fn v1_indexpart_is_parsed() {
+ let example = r#"{
+ "version":1,
+ "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
+ "missing_layers":["not_a_real_layer_but_adding_coverage"],
+ "layer_metadata":{
+ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
+ "not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
+ },
+ "disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+ }"#;

+ let expected = IndexPart {
+ // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
+ version: 1,
+ timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
+ missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
+ layer_metadata: HashMap::from([
+ (RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata {
+ file_size: Some(25600000),
+ }),
+ (RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata {
+ // serde_json should always parse this but this might be a double with jq for
+ // example.
+ file_size: Some(9007199254741001),
+ })
+ ]),
+ disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+ };

+ let part = serde_json::from_str::<IndexPart>(example).unwrap();
+ assert_eq!(part, expected);
+ }
  }
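The v0/v1 test cases above rely on `#[serde(default)]` so that an index_part.json written before the new fields existed still deserializes. A reduced sketch of the same idea (the struct and field names here are invented for the example; it assumes the serde and serde_json crates with the derive feature):

    use serde::Deserialize;
    use std::collections::HashMap;

    // Trimmed-down shape with the same compatibility trick as IndexPart:
    // fields absent from an older JSON document fall back to Default.
    #[derive(Debug, Deserialize)]
    struct IndexLike {
        #[serde(default)]
        version: usize,
        #[serde(default)]
        layer_metadata: HashMap<String, u64>,
        timeline_layers: Vec<String>,
    }

    fn main() {
        // A "v0" document without `version` or `layer_metadata` still parses.
        let v0 = r#"{ "timeline_layers": ["layer_1"] }"#;
        let parsed: IndexLike = serde_json::from_str(v0).unwrap();
        assert_eq!(parsed.version, 0);
        assert!(parsed.layer_metadata.is_empty());
        assert_eq!(parsed.timeline_layers, vec!["layer_1".to_owned()]);
    }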
@@ -69,14 +69,25 @@ pub(super) async fn upload_timeline_layers<'a>(
  .map(|meta| meta.disk_consistent_lsn());

  let already_uploaded_layers = remote_timeline
- .map(|timeline| timeline.stored_files())
- .cloned()
+ .map(|timeline| {
+ timeline
+ .stored_files()
+ .keys()
+ .cloned()
+ .collect::<std::collections::HashSet<_>>()
+ })
  .unwrap_or_default();

  let layers_to_upload = upload
  .layers_to_upload
- .difference(&already_uploaded_layers)
- .cloned()
+ .iter()
+ .filter_map(|(k, v)| {
+ if !already_uploaded_layers.contains(k) {
+ Some((k.to_owned(), v.to_owned()))
+ } else {
+ None
+ }
+ })
  .collect::<Vec<_>>();

  if layers_to_upload.is_empty() {
@@ -98,7 +109,7 @@ pub(super) async fn upload_timeline_layers<'a>(

  let mut upload_tasks = layers_to_upload
  .into_iter()
- .map(|source_path| async move {
+ .map(|(source_path, known_metadata)| async move {
  let source_file = match fs::File::open(&source_path).await.with_context(|| {
  format!(
  "Failed to upen a source file for layer '{}'",
@@ -109,7 +120,7 @@ pub(super) async fn upload_timeline_layers<'a>(
  Err(e) => return Err(UploadError::MissingLocalFile(source_path, e)),
  };

- let source_size = source_file
+ let fs_size = source_file
  .metadata()
  .await
  .with_context(|| {
@@ -119,10 +130,24 @@ pub(super) async fn upload_timeline_layers<'a>(
  )
  })
  .map_err(UploadError::Other)?
- .len() as usize;
+ .len();

+ // FIXME: this looks bad
+ if let Some(metadata_size) = known_metadata.file_size() {
+ if metadata_size != fs_size {
+ return Err(UploadError::Other(anyhow::anyhow!(
+ "File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"
+ )));
+ }
+ } else {
+ // this is a silly state we would like to avoid
+ }

+ let fs_size = usize::try_from(fs_size).with_context(|| format!("File {source_path:?} size {fs_size} could not be converted to usize"))
+ .map_err(UploadError::Other)?;

  match storage
- .upload_storage_object(Box::new(source_file), source_size, &source_path)
+ .upload_storage_object(Box::new(source_file), fs_size, &source_path)
  .await
  .with_context(|| format!("Failed to upload layer file for {sync_id}"))
  {
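The size cross-check added above is just "fail the upload if the on-disk size no longer matches the recorded metadata, and skip the check when there is no recorded size". A tiny hedged sketch of that decision in isolation (function and variable names are made up for the example; it assumes the anyhow crate):

    use anyhow::{anyhow, Result};

    /// Refuse an upload if the file changed size since its metadata was recorded.
    /// `expected` is None for layers tracked by an older index without sizes.
    fn check_upload_size(path: &str, fs_size: u64, expected: Option<u64>) -> Result<()> {
        match expected {
            Some(metadata_size) if metadata_size != fs_size => Err(anyhow!(
                "File {path:?} has FS size {fs_size} different from initially determined {metadata_size}"
            )),
            _ => Ok(()),
        }
    }

    fn main() -> Result<()> {
        check_upload_size("layer_1", 123, Some(123))?;
        check_upload_size("layer_2", 123, None)?; // older index: nothing to compare against
        assert!(check_upload_size("layer_3", 124, Some(123)).is_err());
        Ok(())
    }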
@@ -136,8 +161,11 @@ pub(super) async fn upload_timeline_layers<'a>(
  while let Some(upload_result) = upload_tasks.next().await {
  match upload_result {
  Ok(uploaded_path) => {
- upload.layers_to_upload.remove(&uploaded_path);
- upload.uploaded_layers.insert(uploaded_path);
+ let metadata = upload
+ .layers_to_upload
+ .remove(&uploaded_path)
+ .expect("metadata should always exist, assuming no double uploads");
+ upload.uploaded_layers.insert(uploaded_path, metadata);
  }
  Err(e) => match e {
  UploadError::Other(e) => {
@@ -262,7 +290,7 @@ mod tests {
  assert_eq!(
  upload
  .uploaded_layers
- .iter()
+ .keys()
  .cloned()
  .collect::<BTreeSet<_>>(),
  layer_files
@@ -357,7 +385,7 @@ mod tests {
  assert_eq!(
  upload
  .uploaded_layers
- .iter()
+ .keys()
  .cloned()
  .collect::<BTreeSet<_>>(),
  layer_files
@@ -45,6 +45,7 @@ use crate::tenant_config::TenantConfOpt;
  use crate::virtual_file::VirtualFile;
  use crate::walredo::WalRedoManager;
  use crate::{CheckpointConfig, TEMP_FILE_SUFFIX};
+ pub use pageserver_api::models::TenantState;

  use toml_edit;
  use utils::{
@@ -58,13 +59,14 @@ pub mod block_io;
  mod delta_layer;
  mod disk_btree;
  pub(crate) mod ephemeral_file;
- mod filename;
+ pub mod filename;
  mod image_layer;
  mod inmemory_layer;
- mod layer_map;
+ pub mod layer_map;

  pub mod metadata;
  mod par_fsync;
- mod storage_layer;
+ pub mod storage_layer;

  mod timeline;

@@ -118,18 +120,6 @@ pub struct Tenant {
  upload_layers: bool,
  }

- /// A state of a tenant in pageserver's memory.
- #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
- pub enum TenantState {
- /// Tenant is fully operational, its background jobs might be running or not.
- Active { background_jobs_running: bool },
- /// A tenant is recognized by pageserver, but not yet ready to operate:
- /// e.g. not present locally and being downloaded or being read into memory from the file system.
- Paused,
- /// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated.
- Broken,
- }
-
  /// A repository corresponds to one .neon directory. One repository holds multiple
  /// timelines, forked off from the same initial call to 'initdb'.
  impl Tenant {
@@ -155,17 +145,18 @@ impl Tenant {

  /// Lists timelines the tenant contains.
  /// Up to tenant's implementation to omit certain timelines that ar not considered ready for use.
- pub fn list_timelines(&self) -> Vec<(TimelineId, Arc<Timeline>)> {
+ pub fn list_timelines(&self) -> Vec<Arc<Timeline>> {
  self.timelines
  .lock()
  .unwrap()
- .iter()
- .map(|(timeline_id, timeline_entry)| (*timeline_id, Arc::clone(timeline_entry)))
+ .values()
+ .map(Arc::clone)
  .collect()
  }

- /// Create a new, empty timeline. The caller is responsible for loading data into it
- /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
+ /// This is used to create the initial 'main' timeline during bootstrapping,
+ /// or when importing a new base backup. The caller is expected to load an
+ /// initial image of the datadir to the new timeline after this.
  pub fn create_empty_timeline(
  &self,
  new_timeline_id: TimelineId,
@@ -310,7 +301,7 @@ impl Tenant {

  for (timeline_id, timeline) in &timelines_to_compact {
  let _entered = info_span!("compact_timeline", timeline = %timeline_id).entered();
- timeline.reconstruct()?;
+ timeline.compact()?;
  }

  Ok(())
@@ -356,7 +347,7 @@ impl Tenant {

  ensure!(
  !children_exist,
- "Cannot detach timeline which has child timelines"
+ "Cannot delete timeline which has child timelines"
  );
  let timeline_entry = match timelines.entry(timeline_id) {
  Entry::Occupied(e) => e,
@@ -917,6 +908,7 @@ impl Tenant {
  Ok(totals)
  }

+ /// Branch an existing timeline
  fn branch_timeline(
  &self,
  src: TimelineId,
@@ -992,7 +984,7 @@ impl Tenant {
  dst_prev,
  Some(src),
  start_lsn,
- *src_timeline.latest_gc_cutoff_lsn.read(),
+ *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
  src_timeline.initdb_lsn,
  src_timeline.pg_version,
  );
@@ -1105,12 +1097,22 @@ impl Tenant {

  /// Create the cluster temporarily in 'initdbpath' directory inside the repository
  /// to get bootstrap data for timeline initialization.
- fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path, pg_version: u32) -> Result<()> {
- info!("running initdb in {}... ", initdbpath.display());
+ fn run_initdb(
+ conf: &'static PageServerConf,
+ initdb_target_dir: &Path,
+ pg_version: u32,
+ ) -> Result<()> {
+ let initdb_bin_path = conf.pg_bin_dir(pg_version).join("initdb");
+ let initdb_lib_dir = conf.pg_lib_dir(pg_version);
+ info!(
+ "running {} in {}, libdir: {}",
+ initdb_bin_path.display(),
+ initdb_target_dir.display(),
+ initdb_lib_dir.display(),
+ );

- let initdb_path = conf.pg_bin_dir(pg_version).join("initdb");
- let initdb_output = Command::new(initdb_path)
- .args(&["-D", &initdbpath.to_string_lossy()])
+ let initdb_output = Command::new(initdb_bin_path)
+ .args(&["-D", &initdb_target_dir.to_string_lossy()])
  .args(&["-U", &conf.superuser])
  .args(&["-E", "utf8"])
  .arg("--no-instructions")
@@ -1118,8 +1120,8 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path, pg_version: u32)
  // so no need to fsync it
  .arg("--no-sync")
  .env_clear()
- .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
- .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
+ .env("LD_LIBRARY_PATH", &initdb_lib_dir)
+ .env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
  .stdout(Stdio::null())
  .output()
  .context("failed to execute initdb")?;
@@ -1741,7 +1743,7 @@ mod tests {
         drop(writer);
 
         tline.checkpoint(CheckpointConfig::Forced)?;
-        tline.reconstruct()?;
+        tline.compact()?;
 
         let writer = tline.writer();
         writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
@@ -1749,7 +1751,7 @@ mod tests {
         drop(writer);
 
         tline.checkpoint(CheckpointConfig::Forced)?;
-        tline.reconstruct()?;
+        tline.compact()?;
 
         let writer = tline.writer();
         writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?;
@@ -1757,7 +1759,7 @@ mod tests {
         drop(writer);
 
         tline.checkpoint(CheckpointConfig::Forced)?;
-        tline.reconstruct()?;
+        tline.compact()?;
 
         let writer = tline.writer();
         writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?;
@@ -1765,7 +1767,7 @@ mod tests {
         drop(writer);
 
         tline.checkpoint(CheckpointConfig::Forced)?;
-        tline.reconstruct()?;
+        tline.compact()?;
 
         assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
         assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
@@ -1813,7 +1815,7 @@ mod tests {
 
             tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
             tline.checkpoint(CheckpointConfig::Forced)?;
-            tline.reconstruct()?;
+            tline.compact()?;
             tline.gc()?;
         }
 
@@ -1883,7 +1885,7 @@ mod tests {
             let cutoff = tline.get_last_record_lsn();
             tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
             tline.checkpoint(CheckpointConfig::Forced)?;
-            tline.reconstruct()?;
+            tline.compact()?;
             tline.gc()?;
         }
 
@@ -1962,7 +1964,7 @@ mod tests {
             let cutoff = tline.get_last_record_lsn();
             tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
             tline.checkpoint(CheckpointConfig::Forced)?;
-            tline.reconstruct()?;
+            tline.compact()?;
             tline.gc()?;
         }
 
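The tests above drive the same maintenance sequence the pageserver runs in the background: force a checkpoint, compact, then garbage-collect. A minimal sketch of that call order, with a hypothetical trait standing in for the timeline handle (these names are illustrative, not the repository's API):

// Names are illustrative; this is not the repository's API surface.
type Result = std::result::Result<(), Box<dyn std::error::Error>>;

trait MaintenanceCycle {
    fn checkpoint_forced(&self) -> Result;
    fn compact(&self) -> Result;
    fn gc(&self) -> Result;
}

// One maintenance pass, in the order the tests above exercise it:
// flush in-memory data, compact L0 deltas, then garbage-collect old layers.
fn maintenance_pass(tline: &dyn MaintenanceCycle) -> Result {
    tline.checkpoint_forced()?;
    tline.compact()?;
    tline.gc()
}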
@@ -95,9 +95,6 @@ impl From<&DeltaLayer> for Summary {
 // Flag indicating that this version initialize the page
 const WILL_INIT: u64 = 1;
 
-// Flag indicating page image
-const IS_IMAGE: u64 = 2;
-
 ///
 /// Struct representing reference to BLOB in layers. Reference contains BLOB
 /// offset, and for WAL records it also contains `will_init` flag. The flag
@@ -112,22 +109,15 @@ impl BlobRef {
         (self.0 & WILL_INIT) != 0
     }
 
-    pub fn is_image(&self) -> bool {
-        (self.0 & IS_IMAGE) != 0
-    }
-
     pub fn pos(&self) -> u64 {
-        self.0 >> 2
+        self.0 >> 1
     }
 
-    pub fn new(pos: u64, will_init: bool, is_image: bool) -> BlobRef {
-        let mut blob_ref = pos << 2;
+    pub fn new(pos: u64, will_init: bool) -> BlobRef {
+        let mut blob_ref = pos << 1;
         if will_init {
            blob_ref |= WILL_INIT;
        }
-        if is_image {
-            blob_ref |= IS_IMAGE;
-        }
        BlobRef(blob_ref)
    }
 }
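For reference, the encoding on the new (right-hand) side of this hunk packs the blob offset and the `will_init` flag into a single u64, with the low bit carrying the flag. A small self-contained sketch of that packing (illustrative, not copied from the repository):

// Mirrors the new BlobRef encoding shown above: low bit = will_init,
// remaining bits = blob offset within the layer file.
const WILL_INIT: u64 = 1;

#[derive(Copy, Clone, Debug, PartialEq)]
struct BlobRef(u64);

impl BlobRef {
    fn new(pos: u64, will_init: bool) -> BlobRef {
        let mut v = pos << 1;
        if will_init {
            v |= WILL_INIT;
        }
        BlobRef(v)
    }
    fn will_init(&self) -> bool {
        (self.0 & WILL_INIT) != 0
    }
    fn pos(&self) -> u64 {
        self.0 >> 1
    }
}

fn main() {
    let r = BlobRef::new(4096, true);
    assert_eq!(r.pos(), 4096);
    assert!(r.will_init());
}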
@@ -324,13 +314,13 @@ impl Layer for DeltaLayer {
         }
     }
 
-    fn key_iter<'a>(&'a self, skip_images: bool) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'a> {
+    fn key_iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'a> {
         let inner = match self.load() {
             Ok(inner) => inner,
             Err(e) => panic!("Failed to load a delta layer: {e:?}"),
         };
 
-        match DeltaKeyIter::new(inner, skip_images) {
+        match DeltaKeyIter::new(inner) {
             Ok(iter) => Box::new(iter),
             Err(e) => panic!("Layer index is corrupted: {e:?}"),
         }
@@ -424,30 +414,6 @@ impl Layer for DeltaLayer {
 
         Ok(())
     }
 
-    fn contains(&self, key: &Key) -> Result<bool> {
-        // Open the file and lock the metadata in memory
-        let inner = self.load()?;
-
-        // Scan the page versions backwards, starting from `lsn`.
-        let file = inner.file.as_ref().unwrap();
-        let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            inner.index_start_blk,
-            inner.index_root_blk,
-            file,
-        );
-        let search_key = DeltaKey::from_key_lsn(key, Lsn(0));
-        let mut found = false;
-        reader.visit(
-            &search_key.0,
-            VisitDirection::Forwards,
-            |delta_key, _val| {
-                found = DeltaKey::extract_key_from_buf(delta_key) == *key;
-                false
-            },
-        )?;
-        Ok(found)
-    }
 }
 
 impl DeltaLayer {
@@ -590,7 +556,7 @@ impl DeltaLayer {
 
     /// Create a DeltaLayer struct representing an existing file on disk.
     ///
-    /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
+    /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
     pub fn new_for_path<F>(path: &Path, file: F) -> Result<Self>
     where
         F: FileExt,
@@ -705,13 +671,7 @@ impl DeltaLayerWriter {
     /// The values must be appended in key, lsn order.
     ///
     pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
-        self.put_value_bytes(
-            key,
-            lsn,
-            &Value::ser(&val)?,
-            val.will_init(),
-            val.is_image(),
-        )
+        self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
     }
 
     pub fn put_value_bytes(
@@ -720,12 +680,12 @@ impl DeltaLayerWriter {
         lsn: Lsn,
         val: &[u8],
         will_init: bool,
-        is_image: bool,
     ) -> Result<()> {
         assert!(self.lsn_range.start <= lsn);
 
         let off = self.blob_writer.write_blob(val)?;
-        let blob_ref = BlobRef::new(off, will_init, is_image);
+        let blob_ref = BlobRef::new(off, will_init);
 
         let delta_key = DeltaKey::from_key_lsn(&key, lsn);
         self.tree.append(&delta_key.0, blob_ref.0)?;
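As the hunk above shows, `put_value_bytes` appends the serialized value to a blob writer and records a (key, lsn) → BlobRef entry in the layer's index tree. A rough standalone sketch of that append-only pattern, using a Vec<u8> and a BTreeMap in place of the real blob file and disk B-tree (all names here are illustrative):

use std::collections::BTreeMap;

// Illustrative only: a Vec<u8> stands in for the blob file and a BTreeMap for
// the on-disk B-tree index used by the real DeltaLayerWriter.
struct SketchWriter {
    blobs: Vec<u8>,
    index: BTreeMap<(u64, u64), u64>, // (key, lsn) -> encoded blob reference
}

impl SketchWriter {
    fn put_value_bytes(&mut self, key: u64, lsn: u64, val: &[u8], will_init: bool) {
        // Append the raw bytes and remember where they start.
        let off = self.blobs.len() as u64;
        self.blobs.extend_from_slice(val);
        // Pack the offset and the will_init flag, as BlobRef::new does above.
        let blob_ref = (off << 1) | (will_init as u64);
        self.index.insert((key, lsn), blob_ref);
    }
}

fn main() {
    let mut w = SketchWriter { blobs: Vec::new(), index: BTreeMap::new() };
    w.put_value_bytes(1, 0x10, b"foo", true);
    w.put_value_bytes(1, 0x20, b"bar", false);
    assert_eq!(w.index[&(1, 0x20)] >> 1, 3); // second blob starts at offset 3
}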
@@ -914,7 +874,7 @@ impl Iterator for DeltaKeyIter {
 }
 
 impl<'a> DeltaKeyIter {
-    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>, skip_images: bool) -> Result<Self> {
+    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
         let file = inner.file.as_ref().unwrap();
         let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
             inner.index_start_blk,
@@ -923,33 +883,29 @@ impl<'a> DeltaKeyIter {
         );
 
         let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new();
-        let mut last_pos = 0u64;
-        let mut last_delta: Option<DeltaKey> = None;
         tree_reader.visit(
             &[0u8; DELTA_KEY_SIZE],
             VisitDirection::Forwards,
             |key, value| {
-                let blob_ref = BlobRef(value);
-                if !blob_ref.is_image() || !skip_images {
-                    let next_delta = DeltaKey::from_slice(key);
-                    let pos = blob_ref.pos();
-                    if let Some(prev_delta) = last_delta.take() {
-                        if prev_delta.key() == next_delta.key() {
-                            last_delta = Some(next_delta);
-                            return true;
-                        }
-                        all_keys.push((prev_delta, pos - last_pos));
+                let delta_key = DeltaKey::from_slice(key);
+                let pos = BlobRef(value).pos();
+                if let Some(last) = all_keys.last_mut() {
+                    if last.0.key() == delta_key.key() {
+                        return true;
+                    } else {
+                        // subtract offset of new key BLOB and first blob of this key
+                        // to get total size of values associated with this key
+                        let first_pos = last.1;
+                        last.1 = pos - first_pos;
                     }
-                    last_delta = Some(next_delta);
-                    last_pos = pos;
                 }
+                all_keys.push((delta_key, pos));
                 true
             },
         )?;
-        if let Some(prev_delta) = last_delta.take() {
+        if let Some(last) = all_keys.last_mut() {
             // Last key occupies all space till end of layer
-            let file_size = std::fs::metadata(&file.file.path)?.len();
-            all_keys.push((prev_delta, file_size - last_pos));
+            last.1 = std::fs::metadata(&file.file.path)?.len() - last.1;
         }
         let iter = DeltaKeyIter {
             all_keys,
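The bookkeeping on the new side of this hunk derives each key's total value size from the blob offsets: a key's size is the distance from its first blob to the first blob of the next key, and the last key extends to the end of the file. A self-contained sketch of that idea (not from the repository; u32 keys for brevity):

// Given (key, first blob offset) pairs emitted in key order, turn the stored
// offsets into per-key sizes, the same way DeltaKeyIter::new does above.
fn key_sizes(offsets: &[(u32, u64)], file_len: u64) -> Vec<(u32, u64)> {
    let mut out: Vec<(u32, u64)> = Vec::new();
    for &(key, pos) in offsets {
        if let Some(last) = out.last_mut() {
            if last.0 == key {
                continue; // same key: keep only its first offset
            } else {
                last.1 = pos - last.1; // turn the stored offset into a size
            }
        }
        out.push((key, pos));
    }
    if let Some(last) = out.last_mut() {
        last.1 = file_len - last.1; // last key runs to the end of the layer
    }
    out
}

fn main() {
    let offsets = [(1, 0), (1, 100), (2, 300), (3, 450)];
    assert_eq!(key_sizes(&offsets, 600), vec![(1, 300), (2, 150), (3, 150)]);
}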
@@ -177,7 +177,7 @@ impl fmt::Display for ImageFileName {
 ///
 /// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
 /// global config, and paths to layer files are constructed using the tenant/timeline
-/// path from the config. But in the 'dump_layerfile' binary, we need to construct a Layer
+/// path from the config. But in the 'pageserver_binutils' binary, we need to construct a Layer
 /// struct for a file on disk, without having a page server running, so that we have no
 /// config. In that case, we use the Path variant to hold the full path to the file on
 /// disk.
@@ -223,10 +223,6 @@ impl Layer for ImageLayer {
 
         Ok(())
     }
 
-    fn contains(&self, key: &Key) -> Result<bool> {
-        Ok(self.get_key_range().contains(key))
-    }
 }
 
 impl ImageLayer {
@@ -361,7 +357,7 @@ impl ImageLayer {
 
     /// Create an ImageLayer struct representing an existing file on disk.
     ///
-    /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
+    /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
     pub fn new_for_path<F>(path: &Path, file: F) -> Result<ImageLayer>
     where
         F: std::os::unix::prelude::FileExt,
@@ -235,11 +235,6 @@ impl Layer for InMemoryLayer {
 
         Ok(())
     }
 
-    fn contains(&self, key: &Key) -> Result<bool> {
-        let inner = self.inner.read().unwrap();
-        Ok(inner.index.get(key).is_some())
-    }
 }
 
 impl InMemoryLayer {
@@ -363,14 +358,8 @@ impl InMemoryLayer {
             // Write all page versions
             for (lsn, pos) in vec_map.as_slice() {
                 cursor.read_blob_into_buf(*pos, &mut buf)?;
-                let value = Value::des(&buf)?;
-                delta_layer_writer.put_value_bytes(
-                    key,
-                    *lsn,
-                    &buf,
-                    value.will_init(),
-                    value.is_image(),
-                )?;
+                let will_init = Value::des(&buf)?.will_init();
+                delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
             }
         }
 
@@ -15,25 +15,19 @@ use crate::repository::Key;
|
|||||||
use crate::tenant::inmemory_layer::InMemoryLayer;
|
use crate::tenant::inmemory_layer::InMemoryLayer;
|
||||||
use crate::tenant::storage_layer::Layer;
|
use crate::tenant::storage_layer::Layer;
|
||||||
use crate::tenant::storage_layer::{range_eq, range_overlaps};
|
use crate::tenant::storage_layer::{range_eq, range_overlaps};
|
||||||
|
use amplify_num::i256;
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use std::collections::{BTreeMap, VecDeque};
|
use num_traits::identities::{One, Zero};
|
||||||
|
use num_traits::{Bounded, Num, Signed};
|
||||||
|
use rstar::{RTree, RTreeObject, AABB};
|
||||||
|
use std::cmp::Ordering;
|
||||||
|
use std::collections::VecDeque;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
|
use std::ops::{Add, Div, Mul, Neg, Rem, Sub};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, PartialOrd, Ord, Eq)]
|
|
||||||
struct BTreeKey {
|
|
||||||
lsn: Lsn,
|
|
||||||
seq: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl BTreeKey {
|
|
||||||
fn new(lsn: Lsn) -> BTreeKey {
|
|
||||||
BTreeKey { lsn, seq: 0 }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
///
|
||||||
/// LayerMap tracks what layers exist on a timeline.
|
/// LayerMap tracks what layers exist on a timeline.
|
||||||
///
|
///
|
||||||
@@ -59,11 +53,173 @@ pub struct LayerMap {
|
|||||||
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
|
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
|
||||||
|
|
||||||
/// All the historic layers are kept here
|
/// All the historic layers are kept here
|
||||||
historic_layers: BTreeMap<BTreeKey, Arc<dyn Layer>>,
|
historic_layers: RTree<LayerRTreeObject>,
|
||||||
layers_seqno: usize,
|
|
||||||
|
|
||||||
/// Latest stored delta layer
|
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
|
||||||
latest_delta_layer: Option<Arc<dyn Layer>>,
|
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
|
||||||
|
l0_delta_layers: Vec<Arc<dyn Layer>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct LayerRTreeObject {
|
||||||
|
layer: Arc<dyn Layer>,
|
||||||
|
|
||||||
|
envelope: AABB<[IntKey; 2]>,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Representation of Key as numeric type.
|
||||||
|
// We can not use native implementation of i128, because rstar::RTree
|
||||||
|
// doesn't handle properly integer overflow during area calculation: sum(Xi*Yi).
|
||||||
|
// Overflow will cause panic in debug mode and incorrect area calculation in release mode,
|
||||||
|
// which leads to a non-optimally balanced R-Tree (but doesn't affect the correctness of R-Tree operations).
|
||||||
|
// By using i256 as the type, even though all the actual values would fit in i128, we can be
|
||||||
|
// sure that multiplication doesn't overflow.
|
||||||
|
//
|
||||||
|
|
||||||
|
#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)]
|
||||||
|
struct IntKey(i256);
|
||||||
|
|
||||||
|
impl Copy for IntKey {}
|
||||||
|
|
||||||
|
impl IntKey {
|
||||||
|
fn from(i: i128) -> Self {
|
||||||
|
IntKey(i256::from(i))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Bounded for IntKey {
|
||||||
|
fn min_value() -> Self {
|
||||||
|
IntKey(i256::MIN)
|
||||||
|
}
|
||||||
|
fn max_value() -> Self {
|
||||||
|
IntKey(i256::MAX)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Signed for IntKey {
|
||||||
|
fn is_positive(&self) -> bool {
|
||||||
|
self.0 > i256::ZERO
|
||||||
|
}
|
||||||
|
fn is_negative(&self) -> bool {
|
||||||
|
self.0 < i256::ZERO
|
||||||
|
}
|
||||||
|
fn signum(&self) -> Self {
|
||||||
|
match self.0.cmp(&i256::ZERO) {
|
||||||
|
Ordering::Greater => IntKey(i256::ONE),
|
||||||
|
Ordering::Less => IntKey(-i256::ONE),
|
||||||
|
Ordering::Equal => IntKey(i256::ZERO),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn abs(&self) -> Self {
|
||||||
|
IntKey(self.0.abs())
|
||||||
|
}
|
||||||
|
fn abs_sub(&self, other: &Self) -> Self {
|
||||||
|
if self.0 <= other.0 {
|
||||||
|
IntKey(i256::ZERO)
|
||||||
|
} else {
|
||||||
|
IntKey(self.0 - other.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Neg for IntKey {
|
||||||
|
type Output = Self;
|
||||||
|
fn neg(self) -> Self::Output {
|
||||||
|
IntKey(-self.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Rem for IntKey {
|
||||||
|
type Output = Self;
|
||||||
|
fn rem(self, rhs: Self) -> Self::Output {
|
||||||
|
IntKey(self.0 % rhs.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Div for IntKey {
|
||||||
|
type Output = Self;
|
||||||
|
fn div(self, rhs: Self) -> Self::Output {
|
||||||
|
IntKey(self.0 / rhs.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Add for IntKey {
|
||||||
|
type Output = Self;
|
||||||
|
fn add(self, rhs: Self) -> Self::Output {
|
||||||
|
IntKey(self.0 + rhs.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Sub for IntKey {
|
||||||
|
type Output = Self;
|
||||||
|
fn sub(self, rhs: Self) -> Self::Output {
|
||||||
|
IntKey(self.0 - rhs.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Mul for IntKey {
|
||||||
|
type Output = Self;
|
||||||
|
fn mul(self, rhs: Self) -> Self::Output {
|
||||||
|
IntKey(self.0 * rhs.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl One for IntKey {
|
||||||
|
fn one() -> Self {
|
||||||
|
IntKey(i256::ONE)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Zero for IntKey {
|
||||||
|
fn zero() -> Self {
|
||||||
|
IntKey(i256::ZERO)
|
||||||
|
}
|
||||||
|
fn is_zero(&self) -> bool {
|
||||||
|
self.0 == i256::ZERO
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Num for IntKey {
|
||||||
|
type FromStrRadixErr = <i128 as Num>::FromStrRadixErr;
|
||||||
|
fn from_str_radix(str: &str, radix: u32) -> Result<Self, Self::FromStrRadixErr> {
|
||||||
|
Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialEq for LayerRTreeObject {
|
||||||
|
fn eq(&self, other: &Self) -> bool {
|
||||||
|
// FIXME: ptr_eq might fail to return true for 'dyn'
|
||||||
|
// references. Clippy complains about this. In practice it
|
||||||
|
// seems to work, the assertion below would be triggered
|
||||||
|
// otherwise but this ought to be fixed.
|
||||||
|
#[allow(clippy::vtable_address_comparisons)]
|
||||||
|
Arc::ptr_eq(&self.layer, &other.layer)
|
||||||
|
}
|
||||||
|
}
|
||||||
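The FIXME above concerns comparing `Arc<dyn Layer>` handles by pointer: clippy's `vtable_address_comparisons` lint warns that fat pointers to the same object are not guaranteed to carry identical vtable addresses across codegen units. A tiny illustration of the pattern being used here, with a toy trait instead of the real `Layer` (illustrative only):

use std::sync::Arc;

trait Layerish {}
struct Dummy;
impl Layerish for Dummy {}

fn main() {
    let a: Arc<dyn Layerish> = Arc::new(Dummy);
    let b = Arc::clone(&a);
    let c: Arc<dyn Layerish> = Arc::new(Dummy);

    // Identity comparison of Arc handles; clippy flags this for dyn objects
    // because the vtable half of the fat pointer is not guaranteed to be
    // unique, which is exactly what the FIXME above refers to.
    #[allow(clippy::vtable_address_comparisons)]
    {
        assert!(Arc::ptr_eq(&a, &b));
        assert!(!Arc::ptr_eq(&a, &c));
    }
}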
|
|
||||||
|
impl RTreeObject for LayerRTreeObject {
|
||||||
|
type Envelope = AABB<[IntKey; 2]>;
|
||||||
|
fn envelope(&self) -> Self::Envelope {
|
||||||
|
self.envelope
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LayerRTreeObject {
|
||||||
|
fn new(layer: Arc<dyn Layer>) -> Self {
|
||||||
|
let key_range = layer.get_key_range();
|
||||||
|
let lsn_range = layer.get_lsn_range();
|
||||||
|
|
||||||
|
let envelope = AABB::from_corners(
|
||||||
|
[
|
||||||
|
IntKey::from(key_range.start.to_i128()),
|
||||||
|
IntKey::from(lsn_range.start.0 as i128),
|
||||||
|
],
|
||||||
|
[
|
||||||
|
IntKey::from(key_range.end.to_i128() - 1),
|
||||||
|
IntKey::from(lsn_range.end.0 as i128 - 1),
|
||||||
|
], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive
|
||||||
|
);
|
||||||
|
LayerRTreeObject { layer, envelope }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return value of LayerMap::search
|
/// Return value of LayerMap::search
|
||||||
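The LayerRTreeObject added above stores each layer under a (key, LSN) bounding box and is later queried with envelopes built from a key and an LSN range. A sketch of that query pattern with plain i64 coordinates instead of the i256-backed IntKey (fine for an illustration; per the comment above, the real code needs the wider type so rstar's area arithmetic cannot overflow). Assumes the `rstar` crate, which the diff itself imports:

use rstar::{RTree, RTreeObject, AABB};

struct Rect {
    id: u32,
    lo: [i64; 2], // (key, lsn) lower corner
    hi: [i64; 2], // (key, lsn) upper corner, inclusive
}

impl RTreeObject for Rect {
    type Envelope = AABB<[i64; 2]>;
    fn envelope(&self) -> Self::Envelope {
        AABB::from_corners(self.lo, self.hi)
    }
}

fn main() {
    let mut tree = RTree::new();
    tree.insert(Rect { id: 1, lo: [0, 0], hi: [99, 9] });
    tree.insert(Rect { id: 2, lo: [100, 0], hi: [199, 9] });

    // All rectangles intersecting key = 150 with lsn in 0..=9, mirroring the
    // point-plus-LSN-range envelopes built in the search functions below.
    let probe = AABB::from_corners([150, 0], [150, 9]);
    let hits: Vec<u32> = tree
        .locate_in_envelope_intersecting(&probe)
        .map(|r| r.id)
        .collect();
    assert_eq!(hits, vec![2]);
}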
@@ -88,17 +244,23 @@ impl LayerMap {
|
|||||||
// linear search
|
// linear search
|
||||||
// Find the latest image layer that covers the given key
|
// Find the latest image layer that covers the given key
|
||||||
let mut latest_img: Option<Arc<dyn Layer>> = None;
|
let mut latest_img: Option<Arc<dyn Layer>> = None;
|
||||||
let mut latest_img_lsn = Lsn(0);
|
let mut latest_img_lsn: Option<Lsn> = None;
|
||||||
let mut iter = self
|
let envelope = AABB::from_corners(
|
||||||
|
[IntKey::from(key.to_i128()), IntKey::from(0i128)],
|
||||||
|
[
|
||||||
|
IntKey::from(key.to_i128()),
|
||||||
|
IntKey::from(end_lsn.0 as i128 - 1),
|
||||||
|
],
|
||||||
|
);
|
||||||
|
for e in self
|
||||||
.historic_layers
|
.historic_layers
|
||||||
.range(BTreeKey::new(Lsn(0))..BTreeKey::new(end_lsn));
|
.locate_in_envelope_intersecting(&envelope)
|
||||||
while let Some((_key, l)) = iter.next_back() {
|
{
|
||||||
|
let l = &e.layer;
|
||||||
if l.is_incremental() {
|
if l.is_incremental() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if !l.get_key_range().contains(&key) {
|
assert!(l.get_key_range().contains(&key));
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let img_lsn = l.get_lsn_range().start;
|
let img_lsn = l.get_lsn_range().start;
|
||||||
assert!(img_lsn < end_lsn);
|
assert!(img_lsn < end_lsn);
|
||||||
if Lsn(img_lsn.0 + 1) == end_lsn {
|
if Lsn(img_lsn.0 + 1) == end_lsn {
|
||||||
@@ -108,23 +270,23 @@ impl LayerMap {
|
|||||||
lsn_floor: img_lsn,
|
lsn_floor: img_lsn,
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
latest_img = Some(Arc::clone(l));
|
if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) {
|
||||||
latest_img_lsn = img_lsn;
|
latest_img = Some(Arc::clone(l));
|
||||||
break;
|
latest_img_lsn = Some(img_lsn);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Search the delta layers
|
// Search the delta layers
|
||||||
let mut latest_delta: Option<Arc<dyn Layer>> = None;
|
let mut latest_delta: Option<Arc<dyn Layer>> = None;
|
||||||
let mut iter = self
|
for e in self
|
||||||
.historic_layers
|
.historic_layers
|
||||||
.range(BTreeKey::new(Lsn(0))..BTreeKey::new(end_lsn));
|
.locate_in_envelope_intersecting(&envelope)
|
||||||
while let Some((_key, l)) = iter.next_back() {
|
{
|
||||||
|
let l = &e.layer;
|
||||||
if !l.is_incremental() {
|
if !l.is_incremental() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if !l.get_key_range().contains(&key) {
|
assert!(l.get_key_range().contains(&key));
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if l.get_lsn_range().start >= end_lsn {
|
if l.get_lsn_range().start >= end_lsn {
|
||||||
info!(
|
info!(
|
||||||
"Candidate delta layer {}..{} is too new for lsn {}",
|
"Candidate delta layer {}..{} is too new for lsn {}",
|
||||||
@@ -134,9 +296,6 @@ impl LayerMap {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
assert!(l.get_lsn_range().start < end_lsn);
|
assert!(l.get_lsn_range().start < end_lsn);
|
||||||
if l.get_lsn_range().end <= latest_img_lsn {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if l.get_lsn_range().end >= end_lsn {
|
if l.get_lsn_range().end >= end_lsn {
|
||||||
// this layer contains the requested point in the key/lsn space.
|
// this layer contains the requested point in the key/lsn space.
|
||||||
// No need to search any further
|
// No need to search any further
|
||||||
@@ -162,7 +321,10 @@ impl LayerMap {
|
|||||||
"found (old) layer {} for request on {key} at {end_lsn}",
|
"found (old) layer {} for request on {key} at {end_lsn}",
|
||||||
l.filename().display(),
|
l.filename().display(),
|
||||||
);
|
);
|
||||||
let lsn_floor = std::cmp::max(Lsn(latest_img_lsn.0 + 1), l.get_lsn_range().start);
|
let lsn_floor = std::cmp::max(
|
||||||
|
Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
|
||||||
|
l.get_lsn_range().start,
|
||||||
|
);
|
||||||
Ok(Some(SearchResult {
|
Ok(Some(SearchResult {
|
||||||
lsn_floor,
|
lsn_floor,
|
||||||
layer: l,
|
layer: l,
|
||||||
@@ -170,7 +332,7 @@ impl LayerMap {
|
|||||||
} else if let Some(l) = latest_img {
|
} else if let Some(l) = latest_img {
|
||||||
trace!("found img layer and no deltas for request on {key} at {end_lsn}");
|
trace!("found img layer and no deltas for request on {key} at {end_lsn}");
|
||||||
Ok(Some(SearchResult {
|
Ok(Some(SearchResult {
|
||||||
lsn_floor: latest_img_lsn,
|
lsn_floor: latest_img_lsn.unwrap(),
|
||||||
layer: l,
|
layer: l,
|
||||||
}))
|
}))
|
||||||
} else {
|
} else {
|
||||||
@@ -184,28 +346,9 @@ impl LayerMap {
|
|||||||
///
|
///
|
||||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||||
if layer.get_key_range() == (Key::MIN..Key::MAX) {
|
if layer.get_key_range() == (Key::MIN..Key::MAX) {
|
||||||
self.latest_delta_layer = Some(layer.clone());
|
self.l0_delta_layers.push(layer.clone());
|
||||||
} else if !layer.is_incremental() {
|
|
||||||
// If latest delta layer is followed by image layers
|
|
||||||
// then reset it, preventing generation of partial image layer
|
|
||||||
if let Some(latest_delta) = &self.latest_delta_layer {
|
|
||||||
// May be it is more correct to use contains() rather than inrestects
|
|
||||||
// but one delta layer can be covered by several image layers.
|
|
||||||
let kr1 = layer.get_key_range();
|
|
||||||
let kr2 = latest_delta.get_key_range();
|
|
||||||
if range_overlaps(&kr1, &kr2) {
|
|
||||||
self.latest_delta_layer = None;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
self.historic_layers.insert(
|
self.historic_layers.insert(LayerRTreeObject::new(layer));
|
||||||
BTreeKey {
|
|
||||||
lsn: layer.get_lsn_range().start,
|
|
||||||
seq: self.layers_seqno,
|
|
||||||
},
|
|
||||||
layer,
|
|
||||||
);
|
|
||||||
self.layers_seqno += 1;
|
|
||||||
NUM_ONDISK_LAYERS.inc();
|
NUM_ONDISK_LAYERS.inc();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -216,33 +359,21 @@ impl LayerMap {
|
|||||||
///
|
///
|
||||||
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
|
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||||
if layer.get_key_range() == (Key::MIN..Key::MAX) {
|
if layer.get_key_range() == (Key::MIN..Key::MAX) {
|
||||||
if let Some(latest_layer) = &self.latest_delta_layer {
|
let len_before = self.l0_delta_layers.len();
|
||||||
#[allow(clippy::vtable_address_comparisons)]
|
|
||||||
if Arc::ptr_eq(&layer, latest_layer) {
|
// FIXME: ptr_eq might fail to return true for 'dyn'
|
||||||
self.latest_delta_layer = None;
|
// references. Clippy complains about this. In practice it
|
||||||
}
|
// seems to work, the assertion below would be triggered
|
||||||
}
|
// otherwise but this ought to be fixed.
|
||||||
|
#[allow(clippy::vtable_address_comparisons)]
|
||||||
|
self.l0_delta_layers
|
||||||
|
.retain(|other| !Arc::ptr_eq(other, &layer));
|
||||||
|
assert_eq!(self.l0_delta_layers.len(), len_before - 1);
|
||||||
}
|
}
|
||||||
let len_before = self.historic_layers.len();
|
assert!(self
|
||||||
#[allow(clippy::vtable_address_comparisons)]
|
.historic_layers
|
||||||
self.historic_layers
|
.remove(&LayerRTreeObject::new(layer))
|
||||||
.retain(|_key, other| !Arc::ptr_eq(other, &layer));
|
.is_some());
|
||||||
if self.historic_layers.len() != len_before - 1 {
|
|
||||||
assert!(self.historic_layers.len() == len_before);
|
|
||||||
error!(
|
|
||||||
"Failed to remove {} layer: {}..{}__{}..{}",
|
|
||||||
if layer.is_incremental() {
|
|
||||||
"inremental"
|
|
||||||
} else {
|
|
||||||
"image"
|
|
||||||
},
|
|
||||||
layer.get_key_range().start,
|
|
||||||
layer.get_key_range().end,
|
|
||||||
layer.get_lsn_range().start,
|
|
||||||
layer.get_lsn_range().end
|
|
||||||
);
|
|
||||||
}
|
|
||||||
assert!(self.historic_layers.len() == len_before - 1);
|
|
||||||
NUM_ONDISK_LAYERS.dec();
|
NUM_ONDISK_LAYERS.dec();
|
||||||
}
|
}
|
||||||
|
|
||||||
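As the comment earlier in this file explains, L0 delta layers (key range Key::MIN..Key::MAX) are kept in a side vector as well as in the R-tree, so insert_historic and remove_historic above must keep both containers in step. A stripped-down sketch of that double bookkeeping, with Arc<String> handles standing in for Arc<dyn Layer> (illustrative only, not the repository's types):

use std::sync::Arc;

#[derive(Default)]
struct SketchLayerMap {
    historic: Vec<Arc<String>>,  // stands in for the R-tree of all layers
    l0_deltas: Vec<Arc<String>>, // extra vector holding only L0 deltas
}

impl SketchLayerMap {
    fn insert_historic(&mut self, layer: Arc<String>, is_l0: bool) {
        if is_l0 {
            self.l0_deltas.push(Arc::clone(&layer));
        }
        self.historic.push(layer);
    }

    fn remove_historic(&mut self, layer: &Arc<String>) {
        let before = self.l0_deltas.len();
        self.l0_deltas.retain(|other| !Arc::ptr_eq(other, layer));
        // At most one entry may disappear; exactly one if the layer was an L0 delta.
        assert!(self.l0_deltas.len() == before || self.l0_deltas.len() == before - 1);
        self.historic.retain(|other| !Arc::ptr_eq(other, layer));
    }
}

fn main() {
    let mut map = SketchLayerMap::default();
    let l0 = Arc::new("l0-delta".to_string());
    map.insert_historic(Arc::clone(&l0), true);
    map.remove_historic(&l0);
    assert!(map.historic.is_empty() && map.l0_deltas.is_empty());
}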
@@ -259,10 +390,21 @@ impl LayerMap {
|
|||||||
|
|
||||||
loop {
|
loop {
|
||||||
let mut made_progress = false;
|
let mut made_progress = false;
|
||||||
for (_key, l) in self
|
let envelope = AABB::from_corners(
|
||||||
|
[
|
||||||
|
IntKey::from(range_remain.start.to_i128()),
|
||||||
|
IntKey::from(lsn_range.start.0 as i128),
|
||||||
|
],
|
||||||
|
[
|
||||||
|
IntKey::from(range_remain.end.to_i128() - 1),
|
||||||
|
IntKey::from(lsn_range.end.0 as i128 - 1),
|
||||||
|
],
|
||||||
|
);
|
||||||
|
for e in self
|
||||||
.historic_layers
|
.historic_layers
|
||||||
.range(BTreeKey::new(lsn_range.start)..BTreeKey::new(lsn_range.end))
|
.locate_in_envelope_intersecting(&envelope)
|
||||||
{
|
{
|
||||||
|
let l = &e.layer;
|
||||||
if l.is_incremental() {
|
if l.is_incremental() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -285,30 +427,39 @@ impl LayerMap {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<dyn Layer>> {
|
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<dyn Layer>> {
|
||||||
self.historic_layers
|
self.historic_layers.iter().map(|e| e.layer.clone())
|
||||||
.iter()
|
|
||||||
.map(|(_key, layer)| layer.clone())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Find the last image layer that covers 'key', ignoring any image layers
|
/// Find the last image layer that covers 'key', ignoring any image layers
|
||||||
/// newer than 'lsn'.
|
/// newer than 'lsn'.
|
||||||
fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||||
let mut iter = self
|
let mut candidate_lsn = Lsn(0);
|
||||||
|
let mut candidate = None;
|
||||||
|
let envelope = AABB::from_corners(
|
||||||
|
[IntKey::from(key.to_i128()), IntKey::from(0)],
|
||||||
|
[IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)],
|
||||||
|
);
|
||||||
|
for e in self
|
||||||
.historic_layers
|
.historic_layers
|
||||||
.range(BTreeKey::new(Lsn(0))..BTreeKey::new(lsn + 1));
|
.locate_in_envelope_intersecting(&envelope)
|
||||||
while let Some((_key, l)) = iter.next_back() {
|
{
|
||||||
|
let l = &e.layer;
|
||||||
if l.is_incremental() {
|
if l.is_incremental() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if !l.get_key_range().contains(&key) {
|
assert!(l.get_key_range().contains(&key));
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let this_lsn = l.get_lsn_range().start;
|
let this_lsn = l.get_lsn_range().start;
|
||||||
assert!(this_lsn <= lsn);
|
assert!(this_lsn <= lsn);
|
||||||
return Some(Arc::clone(l));
|
if this_lsn < candidate_lsn {
|
||||||
|
// our previous candidate was better
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
candidate_lsn = this_lsn;
|
||||||
|
candidate = Some(Arc::clone(l));
|
||||||
}
|
}
|
||||||
None
|
|
||||||
|
candidate
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -325,10 +476,18 @@ impl LayerMap {
|
|||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
) -> Result<Vec<(Range<Key>, Option<Arc<dyn Layer>>)>> {
|
) -> Result<Vec<(Range<Key>, Option<Arc<dyn Layer>>)>> {
|
||||||
let mut points = vec![key_range.start];
|
let mut points = vec![key_range.start];
|
||||||
for (_lsn, l) in self
|
let envelope = AABB::from_corners(
|
||||||
|
[IntKey::from(key_range.start.to_i128()), IntKey::from(0)],
|
||||||
|
[
|
||||||
|
IntKey::from(key_range.end.to_i128()),
|
||||||
|
IntKey::from(lsn.0 as i128),
|
||||||
|
],
|
||||||
|
);
|
||||||
|
for e in self
|
||||||
.historic_layers
|
.historic_layers
|
||||||
.range(BTreeKey::new(Lsn(0))..BTreeKey::new(lsn + 1))
|
.locate_in_envelope_intersecting(&envelope)
|
||||||
{
|
{
|
||||||
|
let l = &e.layer;
|
||||||
assert!(l.get_lsn_range().start <= lsn);
|
assert!(l.get_lsn_range().start <= lsn);
|
||||||
let range = l.get_key_range();
|
let range = l.get_key_range();
|
||||||
if key_range.contains(&range.start) {
|
if key_range.contains(&range.start) {
|
||||||
@@ -365,17 +524,26 @@ impl LayerMap {
|
|||||||
if lsn_range.start >= lsn_range.end {
|
if lsn_range.start >= lsn_range.end {
|
||||||
return Ok(0);
|
return Ok(0);
|
||||||
}
|
}
|
||||||
for (_lsn, l) in self
|
let envelope = AABB::from_corners(
|
||||||
|
[
|
||||||
|
IntKey::from(key_range.start.to_i128()),
|
||||||
|
IntKey::from(lsn_range.start.0 as i128),
|
||||||
|
],
|
||||||
|
[
|
||||||
|
IntKey::from(key_range.end.to_i128() - 1),
|
||||||
|
IntKey::from(lsn_range.end.0 as i128 - 1),
|
||||||
|
],
|
||||||
|
);
|
||||||
|
for e in self
|
||||||
.historic_layers
|
.historic_layers
|
||||||
.range(BTreeKey::new(lsn_range.start)..BTreeKey::new(lsn_range.end))
|
.locate_in_envelope_intersecting(&envelope)
|
||||||
{
|
{
|
||||||
|
let l = &e.layer;
|
||||||
if !l.is_incremental() {
|
if !l.is_incremental() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if !range_overlaps(&l.get_key_range(), key_range) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
assert!(range_overlaps(&l.get_lsn_range(), lsn_range));
|
assert!(range_overlaps(&l.get_lsn_range(), lsn_range));
|
||||||
|
assert!(range_overlaps(&l.get_key_range(), key_range));
|
||||||
|
|
||||||
// We ignore level0 delta layers. Unless the whole keyspace fits
|
// We ignore level0 delta layers. Unless the whole keyspace fits
|
||||||
// into one partition
|
// into one partition
|
||||||
@@ -391,8 +559,8 @@ impl LayerMap {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Return all L0 delta layers
|
/// Return all L0 delta layers
|
||||||
pub fn get_latest_delta_layer(&mut self) -> Option<Arc<dyn Layer>> {
|
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<dyn Layer>>> {
|
||||||
self.latest_delta_layer.take()
|
Ok(self.l0_delta_layers.clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// debugging function to print out the contents of the layer map
|
/// debugging function to print out the contents of the layer map
|
||||||
@@ -411,8 +579,8 @@ impl LayerMap {
|
|||||||
}
|
}
|
||||||
|
|
||||||
println!("historic_layers:");
|
println!("historic_layers:");
|
||||||
for (_key, layer) in self.historic_layers.iter() {
|
for e in self.historic_layers.iter() {
|
||||||
layer.dump(verbose)?;
|
e.layer.dump(verbose)?;
|
||||||
}
|
}
|
||||||
println!("End dump LayerMap");
|
println!("End dump LayerMap");
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -139,9 +139,9 @@ pub trait Layer: Send + Sync {
     /// Iterate through all keys and values stored in the layer
     fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
 
-    /// Iterate through all keys stored in the layer. Returns key, lsn and value size.
-    /// It is used only for reconstruction and so is currently implemented only for DeltaLayer
-    fn key_iter(&self, _skip_images: bool) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + '_> {
+    /// Iterate through all keys stored in the layer. Returns key, lsn and value size
+    /// It is used only for compaction and so is currently implemented only for DeltaLayer
+    fn key_iter(&self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + '_> {
         panic!("Not implemented")
     }
 
@@ -150,7 +150,4 @@ pub trait Layer: Send + Sync {
 
     /// Dump summary of the contents of the layer to stdout
     fn dump(&self, verbose: bool) -> Result<()>;
-
-    // Check if ayer contains particular key
-    fn contains(&self, key: &Key) -> Result<bool>;
 }
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use fail::fail_point;
|
use fail::fail_point;
|
||||||
|
use itertools::Itertools;
|
||||||
use once_cell::sync::OnceCell;
|
use once_cell::sync::OnceCell;
|
||||||
use tokio::task::spawn_blocking;
|
use tokio::task::spawn_blocking;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
@@ -51,7 +52,10 @@ use crate::task_mgr::TaskKind;
|
|||||||
use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task};
|
use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task};
|
||||||
use crate::walredo::WalRedoManager;
|
use crate::walredo::WalRedoManager;
|
||||||
use crate::CheckpointConfig;
|
use crate::CheckpointConfig;
|
||||||
use crate::{page_cache, storage_sync};
|
use crate::{
|
||||||
|
page_cache,
|
||||||
|
storage_sync::{self, index::LayerFileMetadata},
|
||||||
|
};
|
||||||
|
|
||||||
pub struct Timeline {
|
pub struct Timeline {
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
@@ -118,7 +122,7 @@ pub struct Timeline {
|
|||||||
|
|
||||||
/// Layer removal lock.
|
/// Layer removal lock.
|
||||||
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
|
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
|
||||||
/// This lock is acquired in [`Timeline::gc`], [`Timeline::reconstruct`],
|
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
|
||||||
/// and [`Tenant::delete_timeline`].
|
/// and [`Tenant::delete_timeline`].
|
||||||
layer_removal_cs: Mutex<()>,
|
layer_removal_cs: Mutex<()>,
|
||||||
|
|
||||||
@@ -468,16 +472,12 @@ impl Timeline {
|
|||||||
CheckpointConfig::Forced => {
|
CheckpointConfig::Forced => {
|
||||||
self.freeze_inmem_layer(false);
|
self.freeze_inmem_layer(false);
|
||||||
self.flush_frozen_layers(true)?;
|
self.flush_frozen_layers(true)?;
|
||||||
self.reconstruct()
|
self.compact()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Mutate the timeline with a [`TimelineWriter`].
|
/// Mutate the timeline with a [`TimelineWriter`].
|
||||||
///
|
|
||||||
/// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter
|
|
||||||
/// is a generic type in this trait. But that doesn't currently work in
|
|
||||||
/// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html
|
|
||||||
pub fn writer(&self) -> TimelineWriter<'_> {
|
pub fn writer(&self) -> TimelineWriter<'_> {
|
||||||
TimelineWriter {
|
TimelineWriter {
|
||||||
tl: self,
|
tl: self,
|
||||||
@@ -509,6 +509,13 @@ impl Timeline {
|
|||||||
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
|
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn get_compaction_threshold(&self) -> usize {
|
||||||
|
let tenant_conf = self.tenant_conf.read().unwrap();
|
||||||
|
tenant_conf
|
||||||
|
.compaction_threshold
|
||||||
|
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
|
||||||
|
}
|
||||||
|
|
||||||
fn get_image_creation_threshold(&self) -> usize {
|
fn get_image_creation_threshold(&self) -> usize {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap();
|
let tenant_conf = self.tenant_conf.read().unwrap();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
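The new get_compaction_threshold above follows the same lookup pattern as the neighbouring settings: a per-tenant Option overrides a global default. A minimal sketch of that fallback (struct and field names are illustrative):

// Per-tenant overrides fall back to the global default, mirroring
// get_compaction_threshold / get_image_creation_threshold above.
struct TenantConf {
    compaction_threshold: Option<usize>,
}

struct GlobalDefaults {
    compaction_threshold: usize,
}

fn compaction_threshold(tenant: &TenantConf, defaults: &GlobalDefaults) -> usize {
    tenant
        .compaction_threshold
        .unwrap_or(defaults.compaction_threshold)
}

fn main() {
    let defaults = GlobalDefaults { compaction_threshold: 10 };
    assert_eq!(compaction_threshold(&TenantConf { compaction_threshold: None }, &defaults), 10);
    assert_eq!(compaction_threshold(&TenantConf { compaction_threshold: Some(3) }, &defaults), 3);
}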
@@ -589,7 +596,7 @@ impl Timeline {
|
|||||||
last_received_wal: Mutex::new(None),
|
last_received_wal: Mutex::new(None),
|
||||||
rel_size_cache: RwLock::new(HashMap::new()),
|
rel_size_cache: RwLock::new(HashMap::new()),
|
||||||
};
|
};
|
||||||
result.repartition_threshold = result.get_checkpoint_distance() * 3;
|
result.repartition_threshold = result.get_checkpoint_distance() / 10;
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -723,7 +730,7 @@ impl Timeline {
|
|||||||
pub fn layer_removal_guard(&self) -> anyhow::Result<MutexGuard<()>> {
|
pub fn layer_removal_guard(&self) -> anyhow::Result<MutexGuard<()>> {
|
||||||
self.layer_removal_cs
|
self.layer_removal_cs
|
||||||
.try_lock()
|
.try_lock()
|
||||||
.map_err(|e| anyhow!("cannot lock reconstruction critical section {e}"))
|
.map_err(|e| anyhow!("cannot lock compaction critical section {e}"))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Retrieve current logical size of the timeline.
|
/// Retrieve current logical size of the timeline.
|
||||||
@@ -1186,8 +1193,8 @@ impl Timeline {
|
|||||||
self.create_image_layers(&partitioning, self.initdb_lsn, true)?
|
self.create_image_layers(&partitioning, self.initdb_lsn, true)?
|
||||||
} else {
|
} else {
|
||||||
// normal case, write out a L0 delta layer file.
|
// normal case, write out a L0 delta layer file.
|
||||||
let delta_path = self.create_delta_layer(&frozen_layer)?;
|
let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?;
|
||||||
HashSet::from([delta_path])
|
HashMap::from([(delta_path, metadata)])
|
||||||
};
|
};
|
||||||
|
|
||||||
fail_point!("flush-frozen-before-sync");
|
fail_point!("flush-frozen-before-sync");
|
||||||
@@ -1213,85 +1220,86 @@ impl Timeline {
|
|||||||
// TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
|
// TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
|
||||||
// *all* the layers, to avoid fsyncing the file multiple times.
|
// *all* the layers, to avoid fsyncing the file multiple times.
|
||||||
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
|
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
|
||||||
self.update_disk_consistent_lsn(disk_consistent_lsn, layer_paths_to_upload)?;
|
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
|
||||||
|
|
||||||
|
// If we were able to advance 'disk_consistent_lsn', save it the metadata file.
|
||||||
|
// After crash, we will restart WAL streaming and processing from that point.
|
||||||
|
if disk_consistent_lsn != old_disk_consistent_lsn {
|
||||||
|
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
|
||||||
|
self.update_metadata_file(disk_consistent_lsn, layer_paths_to_upload)?;
|
||||||
|
// Also update the in-memory copy
|
||||||
|
self.disk_consistent_lsn.store(disk_consistent_lsn);
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Update metadata file
|
/// Update metadata file
|
||||||
fn update_disk_consistent_lsn(
|
fn update_metadata_file(
|
||||||
&self,
|
&self,
|
||||||
disk_consistent_lsn: Lsn,
|
disk_consistent_lsn: Lsn,
|
||||||
layer_paths_to_upload: HashSet<PathBuf>,
|
layer_paths_to_upload: HashMap<PathBuf, LayerFileMetadata>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
// If we were able to advance 'disk_consistent_lsn', save it the metadata file.
|
// We can only save a valid 'prev_record_lsn' value on disk if we
|
||||||
// After crash, we will restart WAL streaming and processing from that point.
|
// flushed *all* in-memory changes to disk. We only track
|
||||||
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
|
// 'prev_record_lsn' in memory for the latest processed record, so we
|
||||||
if disk_consistent_lsn != old_disk_consistent_lsn {
|
// don't remember what the correct value that corresponds to some old
|
||||||
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
|
// LSN is. But if we flush everything, then the value corresponding
|
||||||
|
// current 'last_record_lsn' is correct and we can store it on disk.
|
||||||
|
let RecordLsn {
|
||||||
|
last: last_record_lsn,
|
||||||
|
prev: prev_record_lsn,
|
||||||
|
} = self.last_record_lsn.load();
|
||||||
|
let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn {
|
||||||
|
Some(prev_record_lsn)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
// We can only save a valid 'prev_record_lsn' value on disk if we
|
let ancestor_timeline_id = self
|
||||||
// flushed *all* in-memory changes to disk. We only track
|
.ancestor_timeline
|
||||||
// 'prev_record_lsn' in memory for the latest processed record, so we
|
.as_ref()
|
||||||
// don't remember what the correct value that corresponds to some old
|
.map(|ancestor| ancestor.timeline_id);
|
||||||
// LSN is. But if we flush everything, then the value corresponding
|
|
||||||
// current 'last_record_lsn' is correct and we can store it on disk.
|
|
||||||
let RecordLsn {
|
|
||||||
last: last_record_lsn,
|
|
||||||
prev: prev_record_lsn,
|
|
||||||
} = self.last_record_lsn.load();
|
|
||||||
let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn {
|
|
||||||
Some(prev_record_lsn)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
let ancestor_timeline_id = self
|
let metadata = TimelineMetadata::new(
|
||||||
.ancestor_timeline
|
disk_consistent_lsn,
|
||||||
.as_ref()
|
ondisk_prev_record_lsn,
|
||||||
.map(|ancestor| ancestor.timeline_id);
|
ancestor_timeline_id,
|
||||||
|
self.ancestor_lsn,
|
||||||
|
*self.latest_gc_cutoff_lsn.read(),
|
||||||
|
self.initdb_lsn,
|
||||||
|
self.pg_version,
|
||||||
|
);
|
||||||
|
|
||||||
let metadata = TimelineMetadata::new(
|
fail_point!("checkpoint-before-saving-metadata", |x| bail!(
|
||||||
disk_consistent_lsn,
|
"{}",
|
||||||
ondisk_prev_record_lsn,
|
x.unwrap()
|
||||||
ancestor_timeline_id,
|
));
|
||||||
self.ancestor_lsn,
|
|
||||||
*self.latest_gc_cutoff_lsn.read(),
|
|
||||||
self.initdb_lsn,
|
|
||||||
self.pg_version,
|
|
||||||
);
|
|
||||||
|
|
||||||
fail_point!("checkpoint-before-saving-metadata", |x| bail!(
|
save_metadata(
|
||||||
"{}",
|
self.conf,
|
||||||
x.unwrap()
|
self.timeline_id,
|
||||||
));
|
self.tenant_id,
|
||||||
|
&metadata,
|
||||||
|
false,
|
||||||
|
)?;
|
||||||
|
|
||||||
save_metadata(
|
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
||||||
self.conf,
|
storage_sync::schedule_layer_upload(
|
||||||
self.timeline_id,
|
|
||||||
self.tenant_id,
|
self.tenant_id,
|
||||||
&metadata,
|
self.timeline_id,
|
||||||
false,
|
layer_paths_to_upload,
|
||||||
)?;
|
Some(metadata),
|
||||||
|
);
|
||||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
|
||||||
storage_sync::schedule_layer_upload(
|
|
||||||
self.tenant_id,
|
|
||||||
self.timeline_id,
|
|
||||||
layer_paths_to_upload,
|
|
||||||
Some(metadata),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Also update the in-memory copy
|
|
||||||
self.disk_consistent_lsn.store(disk_consistent_lsn);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write out the given frozen in-memory layer as a new L0 delta file
|
// Write out the given frozen in-memory layer as a new L0 delta file
|
||||||
fn create_delta_layer(&self, frozen_layer: &InMemoryLayer) -> Result<PathBuf> {
|
fn create_delta_layer(
|
||||||
|
&self,
|
||||||
|
frozen_layer: &InMemoryLayer,
|
||||||
|
) -> Result<(PathBuf, LayerFileMetadata)> {
|
||||||
// Write it out
|
// Write it out
|
||||||
let new_delta = frozen_layer.write_to_disk()?;
|
let new_delta = frozen_layer.write_to_disk()?;
|
||||||
let new_delta_path = new_delta.path();
|
let new_delta_path = new_delta.path();
|
||||||
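The reworked update_metadata_file in the hunk above makes two decisions: it only persists a prev_record_lsn when everything up to last_record_lsn has been flushed, and it only rewrites the metadata when disk_consistent_lsn actually advances. A compact sketch of those two checks, with LSNs simplified to u64 (not the repository's API):

fn ondisk_prev_record_lsn(
    disk_consistent_lsn: u64,
    last_record_lsn: u64,
    prev_record_lsn: u64,
) -> Option<u64> {
    // prev_record_lsn is only tracked for the latest record, so it is only
    // valid on disk if everything up to last_record_lsn was flushed.
    if disk_consistent_lsn == last_record_lsn {
        Some(prev_record_lsn)
    } else {
        None
    }
}

fn should_update_metadata(old_disk_consistent_lsn: u64, new_disk_consistent_lsn: u64) -> bool {
    // disk_consistent_lsn may only move forward; equal means nothing to do.
    assert!(new_disk_consistent_lsn >= old_disk_consistent_lsn);
    new_disk_consistent_lsn != old_disk_consistent_lsn
}

fn main() {
    assert_eq!(ondisk_prev_record_lsn(0x40, 0x40, 0x38), Some(0x38));
    assert_eq!(ondisk_prev_record_lsn(0x30, 0x40, 0x38), None);
    assert!(should_update_metadata(0x30, 0x40));
    assert!(!should_update_metadata(0x40, 0x40));
}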
@@ -1317,25 +1325,26 @@ impl Timeline {
|
|||||||
|
|
||||||
// update the timeline's physical size
|
// update the timeline's physical size
|
||||||
let sz = new_delta_path.metadata()?.len();
|
let sz = new_delta_path.metadata()?.len();
|
||||||
|
|
||||||
self.metrics.current_physical_size_gauge.add(sz);
|
self.metrics.current_physical_size_gauge.add(sz);
|
||||||
// update metrics
|
// update metrics
|
||||||
self.metrics.num_persistent_files_created.inc_by(1);
|
self.metrics.num_persistent_files_created.inc_by(1);
|
||||||
self.metrics.persistent_bytes_written.inc_by(sz);
|
self.metrics.persistent_bytes_written.inc_by(sz);
|
||||||
|
|
||||||
Ok(new_delta_path)
|
Ok((new_delta_path, LayerFileMetadata::new(sz)))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn reconstruct(&self) -> anyhow::Result<()> {
|
pub fn compact(&self) -> anyhow::Result<()> {
|
||||||
let last_record_lsn = self.get_last_record_lsn();
|
let last_record_lsn = self.get_last_record_lsn();
|
||||||
|
|
||||||
// Last record Lsn could be zero in case the timelie was just created
|
// Last record Lsn could be zero in case the timelie was just created
|
||||||
if !last_record_lsn.is_valid() {
|
if !last_record_lsn.is_valid() {
|
||||||
warn!("Skipping reconstruction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
|
warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// High level strategy for reconstruction / image creation:
|
// High level strategy for compaction / image creation:
|
||||||
//
|
//
|
||||||
// 1. First, calculate the desired "partitioning" of the
|
// 1. First, calculate the desired "partitioning" of the
|
||||||
// currently in-use key space. The goal is to partition the
|
// currently in-use key space. The goal is to partition the
|
||||||
@@ -1359,13 +1368,13 @@ impl Timeline {
|
|||||||
// total in the delta file. Or perhaps: if creating an image
|
// total in the delta file. Or perhaps: if creating an image
|
||||||
// file would allow to delete some older files.
|
// file would allow to delete some older files.
|
||||||
//
|
//
|
||||||
// 3. After that, we reconstruct all level0 delta files if there
|
// 3. After that, we compact all level0 delta files if there
|
||||||
// are too many of them. While reconstructing, we also garbage
|
// are too many of them. While compacting, we also garbage
|
||||||
// collect any page versions that are no longer needed because
|
// collect any page versions that are no longer needed because
|
||||||
// of the new image layers we created in step 2.
|
// of the new image layers we created in step 2.
|
||||||
//
|
//
|
||||||
// TODO: This high level strategy hasn't been implemented yet.
|
// TODO: This high level strategy hasn't been implemented yet.
|
||||||
// Below are functions reconstruct_level0() and create_image_layers()
|
// Below are functions compact_level0() and create_image_layers()
|
||||||
// but they are a bit ad hoc and don't quite work like it's explained
|
// but they are a bit ad hoc and don't quite work like it's explained
|
||||||
// above. Rewrite it.
|
// above. Rewrite it.
|
||||||
let _layer_removal_cs = self.layer_removal_cs.lock().unwrap();
|
let _layer_removal_cs = self.layer_removal_cs.lock().unwrap();
|
||||||
@@ -1388,25 +1397,25 @@ impl Timeline {
|
|||||||
storage_sync::schedule_layer_upload(
|
storage_sync::schedule_layer_upload(
|
||||||
self.tenant_id,
|
self.tenant_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
HashSet::from_iter(layer_paths_to_upload),
|
layer_paths_to_upload,
|
||||||
None,
|
None,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 3. Compact
|
||||||
|
let timer = self.metrics.compact_time_histo.start_timer();
|
||||||
|
self.compact_level0(target_file_size)?;
|
||||||
|
timer.stop_and_record();
|
||||||
}
|
}
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
// no partitioning? This is normal, if the timeline was just created
|
// no partitioning? This is normal, if the timeline was just created
|
||||||
// as an empty timeline. Also in unit tests, when we use the timeline
|
// as an empty timeline. Also in unit tests, when we use the timeline
|
||||||
// as a simple key-value store, ignoring the datadir layout. Log the
|
// as a simple key-value store, ignoring the datadir layout. Log the
|
||||||
// error but continue.
|
// error but continue.
|
||||||
error!("could not reconstruct, repartitioning keyspace failed: {err:?}");
|
error!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// 3. Reconstruct
|
|
||||||
let timer = self.metrics.reconstruct_time_histo.start_timer();
|
|
||||||
self.reconstruct_level0(target_file_size)?;
|
|
||||||
timer.stop_and_record();
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1469,10 +1478,9 @@ impl Timeline {
|
|||||||
partitioning: &KeyPartitioning,
|
partitioning: &KeyPartitioning,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
force: bool,
|
force: bool,
|
||||||
) -> Result<HashSet<PathBuf>> {
|
) -> Result<HashMap<PathBuf, LayerFileMetadata>> {
|
||||||
let timer = self.metrics.create_images_time_histo.start_timer();
|
let timer = self.metrics.create_images_time_histo.start_timer();
|
||||||
let mut image_layers: Vec<ImageLayer> = Vec::new();
|
let mut image_layers: Vec<ImageLayer> = Vec::new();
|
||||||
let mut layer_paths_to_upload = HashSet::new();
|
|
||||||
for partition in partitioning.parts.iter() {
|
for partition in partitioning.parts.iter() {
|
||||||
if force || self.time_for_new_image_layer(partition, lsn)? {
|
if force || self.time_for_new_image_layer(partition, lsn)? {
|
||||||
let img_range =
|
let img_range =
|
||||||
@@ -1494,7 +1502,6 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
let image_layer = image_layer_writer.finish()?;
|
let image_layer = image_layer_writer.finish()?;
|
||||||
layer_paths_to_upload.insert(image_layer.path());
|
|
||||||
image_layers.push(image_layer);
|
image_layers.push(image_layer);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1506,17 +1513,27 @@ impl Timeline {
|
|||||||
// We must also fsync the timeline dir to ensure the directory entries for
|
// We must also fsync the timeline dir to ensure the directory entries for
|
||||||
// new layer files are durable
|
// new layer files are durable
|
||||||
//
|
//
|
||||||
// Reconstruction creates multiple image layers. It would be better to create them all
|
// Compaction creates multiple image layers. It would be better to create them all
|
||||||
// and fsync them all in parallel.
|
// and fsync them all in parallel.
|
||||||
let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone());
|
let all_paths = image_layers
|
||||||
all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));
|
.iter()
|
||||||
|
.map(|layer| layer.path())
|
||||||
|
.chain(std::iter::once(
|
||||||
|
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||||
|
))
|
||||||
|
.collect::<Vec<_>>();
|
||||||
par_fsync::par_fsync(&all_paths)?;
|
par_fsync::par_fsync(&all_paths)?;
|
||||||
|
|
||||||
|
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
|
||||||
|
|
||||||
let mut layers = self.layers.write().unwrap();
|
let mut layers = self.layers.write().unwrap();
|
||||||
for l in image_layers {
|
for l in image_layers {
|
||||||
self.metrics
|
let path = l.path();
|
||||||
.current_physical_size_gauge
|
let metadata = path.metadata()?;
|
||||||
.add(l.path().metadata()?.len());
|
|
||||||
|
layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len()));
|
||||||
|
|
||||||
|
self.metrics.current_physical_size_gauge.add(metadata.len());
|
||||||
layers.insert_historic(Arc::new(l));
|
layers.insert_historic(Arc::new(l));
|
||||||
}
|
}
|
||||||
drop(layers);
|
drop(layers);
|
||||||
@@ -1526,46 +1543,230 @@ impl Timeline {
     }

     ///
-    /// Collect a bunch of Level 0 layer files, and reconstruct and reshuffle them as
+    /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
     /// as Level 1 files.
     ///
-    fn reconstruct_level0(&self, target_file_size: u64) -> Result<()> {
-        let mut layers = self.layers.write().unwrap();
-        let latest_delta_layer = layers.get_latest_delta_layer();
+    fn compact_level0(&self, target_file_size: u64) -> Result<()> {
+        let layers = self.layers.read().unwrap();
+        let mut level0_deltas = layers.get_level0_deltas()?;
         drop(layers);
-        let mut writer: Option<DeltaLayerWriter> = None;
+
+        // Only compact if enough layers have accumulated.
+        if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() {
+            return Ok(());
+        }
+
+        // Gather the files to compact in this iteration.
+        //
+        // Start with the oldest Level 0 delta file, and collect any other
+        // level 0 files that form a contiguous sequence, such that the end
+        // LSN of previous file matches the start LSN of the next file.
+        //
+        // Note that if the files don't form such a sequence, we might
+        // "compact" just a single file. That's a bit pointless, but it allows
+        // us to get rid of the level 0 file, and compact the other files on
+        // the next iteration. This could probably made smarter, but such
+        // "gaps" in the sequence of level 0 files should only happen in case
+        // of a crash, partial download from cloud storage, or something like
+        // that, so it's not a big deal in practice.
+        level0_deltas.sort_by_key(|l| l.get_lsn_range().start);
+        let mut level0_deltas_iter = level0_deltas.iter();
+
+        let first_level0_delta = level0_deltas_iter.next().unwrap();
+        let mut prev_lsn_end = first_level0_delta.get_lsn_range().end;
+        let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)];
+        for l in level0_deltas_iter {
+            let lsn_range = l.get_lsn_range();
+
+            if lsn_range.start != prev_lsn_end {
+                break;
+            }
+            deltas_to_compact.push(Arc::clone(l));
+            prev_lsn_end = lsn_range.end;
+        }
+        let lsn_range = Range {
+            start: deltas_to_compact.first().unwrap().get_lsn_range().start,
+            end: deltas_to_compact.last().unwrap().get_lsn_range().end,
+        };
+
+        info!(
+            "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
+            lsn_range.start,
+            lsn_range.end,
+            deltas_to_compact.len(),
+            level0_deltas.len()
+        );
+        for l in deltas_to_compact.iter() {
+            info!("compact includes {}", l.filename().display());
+        }
+        // We don't need the original list of layers anymore. Drop it so that
+        // we don't accidentally use it later in the function.
+        drop(level0_deltas);
+
+        // This iterator walks through all key-value pairs from all the layers
+        // we're compacting, in key, LSN order.
+        let all_values_iter = deltas_to_compact
+            .iter()
+            .map(|l| l.iter())
+            .kmerge_by(|a, b| {
+                if let Ok((a_key, a_lsn, _)) = a {
+                    if let Ok((b_key, b_lsn, _)) = b {
+                        match a_key.cmp(b_key) {
+                            Ordering::Less => true,
+                            Ordering::Equal => a_lsn <= b_lsn,
+                            Ordering::Greater => false,
+                        }
+                    } else {
+                        false
+                    }
+                } else {
+                    true
+                }
+            });
+
+        // This iterator walks through all keys and is needed to calculate size used by each key
+        let mut all_keys_iter = deltas_to_compact
+            .iter()
+            .map(|l| l.key_iter())
+            .kmerge_by(|a, b| {
+                let (a_key, a_lsn, _) = a;
+                let (b_key, b_lsn, _) = b;
+                match a_key.cmp(b_key) {
+                    Ordering::Less => true,
+                    Ordering::Equal => a_lsn <= b_lsn,
+                    Ordering::Greater => false,
+                }
+            });
+
+        // Merge the contents of all the input delta layers into a new set
+        // of delta layers, based on the current partitioning.
+        //
+        // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one.
+        // It's possible that there is a single key with so many page versions that storing all of them in a single layer file
+        // would be too large. In that case, we also split on the LSN dimension.
+        //
+        // LSN
+        //  ^
+        //  |
+        //  | +-----------+            +--+--+--+--+
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+     ==>    |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            +--+--+--+--+
+        //  |
+        //  +--------------> key
+        //
+        //
+        // If one key (X) has a lot of page versions:
+        //
+        // LSN
+        //  ^
+        //  |                               (X)
+        //  | +-----------+            +--+--+--+--+
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            |  |  +--+  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+     ==>    |  |  |  |  |
+        //  | |           |            |  |  +--+  |
+        //  | +-----------+            |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            +--+--+--+--+
+        //  |
+        //  +--------------> key
+        // TODO: this actually divides the layers into fixed-size chunks, not
+        // based on the partitioning.
+        //
+        // TODO: we should also opportunistically materialize and
+        // garbage collect what we can.
         let mut new_layers = Vec::new();
-        let mut last_key: Option<Key> = None;
-        if let Some(last_delta_layer) = latest_delta_layer {
-            let end_lsn = last_delta_layer.get_lsn_range().end;
-            let lsn_range = end_lsn..end_lsn + 1;
-            for (key, lsn, _) in last_delta_layer.key_iter(true) {
-                let value = self.get(key, lsn)?;
-                if let Some(curr_writer) = &writer {
-                    if curr_writer.size() > target_file_size {
-                        new_layers.push(writer.take().unwrap().finish(key)?);
+        let mut prev_key: Option<Key> = None;
+        let mut writer: Option<DeltaLayerWriter> = None;
+        let mut key_values_total_size = 0u64;
+        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
+        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
+        for x in all_values_iter {
+            let (key, lsn, value) = x?;
+            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
+            // We need to check key boundaries once we reach next key or end of layer with the same key
+            if !same_key || lsn == dup_end_lsn {
+                let mut next_key_size = 0u64;
+                let is_dup_layer = dup_end_lsn.is_valid();
+                dup_start_lsn = Lsn::INVALID;
+                if !same_key {
+                    dup_end_lsn = Lsn::INVALID;
+                }
+                // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
+                for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
+                    next_key_size = next_size;
+                    if key != next_key {
+                        if dup_end_lsn.is_valid() {
+                            // We are writting segment with duplicates:
+                            // place all remaining values of this key in separate segment
+                            dup_start_lsn = dup_end_lsn; // new segments starts where old stops
+                            dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
+                        }
+                        break;
+                    }
+                    key_values_total_size += next_size;
+                    // Check if it is time to split segment: if total keys size is larger than target file size.
+                    // We need to avoid generation of empty segments if next_size > target_file_size.
+                    if key_values_total_size > target_file_size && lsn != next_lsn {
+                        // Split key between multiple layers: such layer can contain only single key
+                        dup_start_lsn = if dup_end_lsn.is_valid() {
+                            dup_end_lsn // new segment with duplicates starts where old one stops
+                        } else {
+                            lsn // start with the first LSN for this key
+                        };
+                        dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
+                        break;
+                    }
+                }
+                // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
+                if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
+                    dup_start_lsn = dup_end_lsn;
+                    dup_end_lsn = lsn_range.end;
+                }
+                if writer.is_some() {
+                    let written_size = writer.as_mut().unwrap().size();
+                    // check if key cause layer overflow...
+                    if is_dup_layer
+                        || dup_end_lsn.is_valid()
+                        || written_size + key_values_total_size > target_file_size
+                    {
+                        // ... if so, flush previous layer and prepare to write new one
+                        new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?);
                         writer = None;
                     }
                 }
-                // Create writer if not initiaized yet
-                if writer.is_none() {
-                    writer = Some(DeltaLayerWriter::new(
-                        self.conf,
-                        self.timeline_id,
-                        self.tenant_id,
-                        key,
-                        lsn_range.clone(),
-                    )?);
-                }
-                writer
-                    .as_mut()
-                    .unwrap()
-                    .put_value(key, end_lsn, Value::Image(value))?;
-                last_key = Some(key);
+                // Remember size of key value because at next iteration we will access next item
+                key_values_total_size = next_key_size;
             }
+            if writer.is_none() {
+                // Create writer if not initiaized yet
+                writer = Some(DeltaLayerWriter::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_id,
+                    key,
+                    if dup_end_lsn.is_valid() {
+                        // this is a layer containing slice of values of the same key
+                        debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
+                        dup_start_lsn..dup_end_lsn
+                    } else {
+                        debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+                        lsn_range.clone()
+                    },
+                )?);
+            }
+            writer.as_mut().unwrap().put_value(key, lsn, value)?;
+            prev_key = Some(key);
         }
         if let Some(writer) = writer {
-            new_layers.push(writer.finish(last_key.unwrap().next())?);
+            new_layers.push(writer.finish(prev_key.unwrap().next())?);
         }

         // Sync layers
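The merge-and-split strategy described in the comments of the hunk above (k-way merge of the L0 deltas in (key, LSN) order, cutting a new output layer once a size threshold would be crossed) can be illustrated with a small, self-contained Rust sketch. Everything below is illustrative only: the tuple type, the merge_and_split name and the target_file_size parameter stand in for the pageserver's Key/Lsn/DeltaLayerWriter machinery, the LSN-dimension split for oversized keys is omitted, and the itertools crate is assumed for kmerge_by.

// Minimal sketch of the merge-and-split idea, over plain (key, lsn, size) tuples.
use itertools::Itertools;

fn merge_and_split(
    level0: Vec<Vec<(u64, u64, u64)>>, // each inner vec: sorted (key, lsn, value_size)
    target_file_size: u64,
) -> Vec<Vec<(u64, u64, u64)>> {
    // k-way merge in (key, lsn) order, like the kmerge_by calls above
    let merged = level0
        .into_iter()
        .map(|l| l.into_iter())
        .kmerge_by(|a, b| (a.0, a.1) <= (b.0, b.1));

    let mut output: Vec<Vec<(u64, u64, u64)>> = Vec::new();
    let mut current: Vec<(u64, u64, u64)> = Vec::new();
    let mut current_size = 0u64;
    let mut prev_key: Option<u64> = None;

    for (key, lsn, size) in merged {
        // only cut a new output "layer" at a key boundary, so one key's
        // versions stay together in this simplified version
        let key_boundary = prev_key.map_or(false, |p| p != key);
        if key_boundary && current_size + size > target_file_size && !current.is_empty() {
            output.push(std::mem::take(&mut current));
            current_size = 0;
        }
        current.push((key, lsn, size));
        current_size += size;
        prev_key = Some(key);
    }
    if !current.is_empty() {
        output.push(current);
    }
    output
}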
@@ -1583,18 +1784,35 @@ impl Timeline {
     }

         let mut layers = self.layers.write().unwrap();
-        let mut new_layer_paths = HashSet::with_capacity(new_layers.len());
+        let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
         for l in new_layers {
             let new_delta_path = l.path();

-            // update the timeline's physical size
-            self.metrics
-                .current_physical_size_gauge
-                .add(new_delta_path.metadata()?.len());
-
-            new_layer_paths.insert(new_delta_path);
+            let metadata = new_delta_path.metadata()?;
+
+            // update the timeline's physical size
+            self.metrics.current_physical_size_gauge.add(metadata.len());
+
+            new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
             layers.insert_historic(Arc::new(l));
         }

+        // Now that we have reshuffled the data to set of new delta layers, we can
+        // delete the old ones
+        let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len());
+        drop(all_keys_iter);
+        for l in deltas_to_compact {
+            if let Some(path) = l.local_path() {
+                self.metrics
+                    .current_physical_size_gauge
+                    .sub(path.metadata()?.len());
+                layer_paths_do_delete.insert(path);
+            }
+            l.delete()?;
+            layers.remove_historic(l);
+        }
+        drop(layers);
+
         if self.upload_layers.load(atomic::Ordering::Relaxed) {
             storage_sync::schedule_layer_upload(
                 self.tenant_id,
@@ -1602,6 +1820,11 @@ impl Timeline {
                 new_layer_paths,
                 None,
             );
+            storage_sync::schedule_layer_delete(
+                self.tenant_id,
+                self.timeline_id,
+                layer_paths_do_delete,
+            );
         }

         Ok(())
@@ -1609,10 +1832,10 @@ impl Timeline {

     /// Update information about which layer files need to be retained on
     /// garbage collection. This is separate from actually performing the GC,
-    /// and is updated more frequently, so that reconstruction can remove obsolete
+    /// and is updated more frequently, so that compaction can remove obsolete
     /// page versions more aggressively.
     ///
-    /// TODO: that's wishful thinking, reconstruction doesn't actually do that
+    /// TODO: that's wishful thinking, compaction doesn't actually do that
     /// currently.
     ///
     /// The caller specifies how much history is needed with the 3 arguments:
@@ -1736,6 +1959,9 @@ impl Timeline {
                 new_gc_cutoff
             );
             write_guard.store_and_unlock(new_gc_cutoff).wait();
+
+            // Persist metadata file
+            self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
         }

         info!("GC starting");
@@ -1862,6 +2088,18 @@ impl Timeline {
                 result.layers_removed += 1;
             }
         }
+
+        info!(
+            "GC completed removing {} layers, cuttof {}",
+            result.layers_removed, new_gc_cutoff
+        );
+        if result.layers_removed != 0 {
+            fail_point!("gc-before-save-metadata", |_| {
+                info!("Abnormaly terinate pageserver at gc-before-save-metadata fail point");
+                std::process::abort();
+            });
+            return Ok(result);
+        }

         if self.upload_layers.load(atomic::Ordering::Relaxed) {
             storage_sync::schedule_layer_delete(
                 self.tenant_id,
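The fail_point! added in the GC hunk above comes from the fail crate. A minimal hedged sketch of the same pattern, with a made-up fail point name and a simplified function (assumes the fail and anyhow crates, with the crate's failpoints feature enabled in test builds):

use fail::fail_point;

fn save_metadata() -> anyhow::Result<()> {
    // When a test enables this point (e.g. fail::cfg("example-before-save-metadata", "return")),
    // the closure runs instead of the code below and the process aborts.
    fail_point!("example-before-save-metadata", |_| {
        std::process::abort();
    });
    // ... persist metadata here in the real code path ...
    Ok(())
}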
@@ -24,7 +24,7 @@ pub mod defaults {
     // This parameter determines L1 layer file size.
     pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;

-    pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s";
+    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
     pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;

     pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
@@ -1,7 +1,7 @@
 //! This module acts as a switchboard to access different repositories managed by this
 //! page server.

-use std::collections::{hash_map, HashMap, HashSet};
+use std::collections::{hash_map, HashMap};
 use std::ffi::OsStr;
 use std::fs;
 use std::path::{Path, PathBuf};
@@ -14,15 +14,15 @@ use remote_storage::GenericRemoteStorage;

 use crate::config::{PageServerConf, METADATA_FILE_NAME};
 use crate::http::models::TenantInfo;
-use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex};
-use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData};
+use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex};
+use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::{
     ephemeral_file::is_ephemeral_file, metadata::TimelineMetadata, Tenant, TenantState,
 };
 use crate::tenant_config::TenantConfOpt;
 use crate::walredo::PostgresRedoManager;
-use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX};
+use crate::TEMP_FILE_SUFFIX;

 use utils::crashsafe_dir::{self, path_with_suffix_extension};
 use utils::id::{TenantId, TimelineId};
@@ -70,34 +70,54 @@ pub fn init_tenant_mgr(
             .remote_storage_config
             .as_ref()
             .expect("remote storage without config");
+        let mut broken_tenants = HashMap::new();
+        let mut ready_tenants = HashMap::new();
+        for (tenant_id, tenant_attach_data) in local_tenant_files.into_iter() {
+            match tenant_attach_data {
+                TenantAttachData::Ready(t) => {
+                    ready_tenants.insert(tenant_id, t);
+                }
+                TenantAttachData::Broken(e) => {
+                    broken_tenants.insert(tenant_id, TenantAttachData::Broken(e));
+                }
+            }
+        }
         let SyncStartupData {
             remote_index,
             local_timeline_init_statuses,
         } = storage_sync::spawn_storage_sync_task(
             conf,
-            local_tenant_files,
+            ready_tenants,
             storage,
             storage_config.max_concurrent_syncs,
             storage_config.max_sync_errors,
         )
        .context("Failed to spawn the storage sync thread")?;

-        (
-            remote_index,
-            local_timeline_init_statuses.filter_map(|init_status| match init_status {
-                LocalTimelineInitStatus::LocallyComplete(metadata) => Some(metadata),
-                LocalTimelineInitStatus::NeedsSync => None,
-            }),
-        )
+        let n = local_timeline_init_statuses.0.len();
+        let mut synced_timelines = local_timeline_init_statuses.0.into_iter().fold(
+            HashMap::<TenantId, TenantAttachData>::with_capacity(n),
+            |mut new_values, (tenant_id, old_values)| {
+                let new_timeline_values = new_values
+                    .entry(tenant_id)
+                    .or_insert_with(|| TenantAttachData::Ready(HashMap::new()));
+                if let TenantAttachData::Ready(t) = new_timeline_values {
+                    for (timeline_id, old_value) in old_values {
+                        if let LocalTimelineInitStatus::LocallyComplete(metadata) = old_value {
+                            t.insert(timeline_id, TimelineLocalFiles::ready(metadata));
+                        }
+                    }
+                }
+                new_values
+            },
+        );
+        synced_timelines.extend(broken_tenants);
+
+        (remote_index, synced_timelines)
     } else {
         info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
-        (
-            RemoteIndex::default(),
-            local_tenant_files.filter_map(|(metadata, _)| Some(metadata)),
-        )
+        (RemoteIndex::default(), local_tenant_files)
     };

     attach_local_tenants(conf, &remote_index, tenants_to_attach);

     Ok(remote_index)
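The fold in the hunk above groups per-timeline sync statuses into per-tenant attach data. A simplified, self-contained sketch of that grouping pattern follows; the types and names here are stand-ins for illustration, not the pageserver's real ones.

use std::collections::HashMap;

#[derive(Debug)]
enum Status {
    LocallyComplete(String), // payload stands in for TimelineMetadata
    NeedsSync,
}

fn group_by_tenant(
    statuses: Vec<(u32, u32, Status)>, // (tenant_id, timeline_id, status)
) -> HashMap<u32, HashMap<u32, String>> {
    statuses
        .into_iter()
        .fold(HashMap::new(), |mut acc, (tenant, timeline, status)| {
            // one bucket per tenant, created lazily, like entry().or_insert_with above
            let timelines = acc.entry(tenant).or_insert_with(HashMap::new);
            if let Status::LocallyComplete(metadata) = status {
                timelines.insert(timeline, metadata);
            }
            acc
        })
}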
@@ -108,23 +128,21 @@ pub fn init_tenant_mgr(
 /// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken",
 /// and the load continues.
 ///
+/// For successful tenant attach, it first has to have a `timelines/` subdirectory and a tenant config file that's loaded into memory successfully.
+/// If either of the conditions fails, the tenant will be added to memory with [`TenantState::Broken`] state, otherwise we start to load its timelines.
+/// Alternatively, tenant is considered loaded successfully, if it's already in pageserver's memory (i.e. was loaded already before).
+///
 /// Attach happens on startup and sucessful timeline downloads
 /// (some subset of timeline files, always including its metadata, after which the new one needs to be registered).
 pub fn attach_local_tenants(
     conf: &'static PageServerConf,
     remote_index: &RemoteIndex,
-    tenants_to_attach: TenantTimelineValues<TimelineMetadata>,
+    tenants_to_attach: HashMap<TenantId, TenantAttachData>,
 ) {
     let _entered = info_span!("attach_local_tenants").entered();
-    let number_of_tenants = tenants_to_attach.0.len();
-
-    for (tenant_id, local_timelines) in tenants_to_attach.0 {
-        info!(
-            "Attaching {} timelines for {tenant_id}",
-            local_timelines.len()
-        );
-        debug!("Timelines to attach: {local_timelines:?}");
+    let number_of_tenants = tenants_to_attach.len();

+    for (tenant_id, local_timelines) in tenants_to_attach {
         let mut tenants_accessor = tenants_state::write_tenants();
         let tenant = match tenants_accessor.entry(tenant_id) {
             hash_map::Entry::Occupied(o) => {
@@ -133,25 +151,55 @@ pub fn attach_local_tenants(
             }
             hash_map::Entry::Vacant(v) => {
                 info!("Tenant {tenant_id} was not found in pageserver's memory, loading it");
-                let tenant = load_local_tenant(conf, tenant_id, remote_index);
+                let tenant = Arc::new(Tenant::new(
+                    conf,
+                    TenantConfOpt::default(),
+                    Arc::new(PostgresRedoManager::new(conf, tenant_id)),
+                    tenant_id,
+                    remote_index.clone(),
+                    conf.remote_storage_config.is_some(),
+                ));
+                match local_timelines {
+                    TenantAttachData::Broken(_) => {
+                        tenant.set_state(TenantState::Broken);
+                    }
+                    TenantAttachData::Ready(_) => {
+                        match Tenant::load_tenant_config(conf, tenant_id) {
+                            Ok(tenant_conf) => {
+                                tenant.update_tenant_config(tenant_conf);
+                                tenant.activate(false);
+                            }
+                            Err(e) => {
+                                error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}");
+                                tenant.set_state(TenantState::Broken);
+                            }
+                        };
+                    }
+                }
                 v.insert(Arc::clone(&tenant));
                 tenant
             }
         };
         drop(tenants_accessor);
-        if tenant.current_state() == TenantState::Broken {
-            warn!("Skipping timeline load for broken tenant {tenant_id}")
-        } else {
-            let has_timelines = !local_timelines.is_empty();
-            match tenant.init_attach_timelines(local_timelines) {
-                Ok(()) => {
-                    info!("successfully loaded local timelines for tenant {tenant_id}");
-                    tenant.activate(has_timelines);
-                }
-                Err(e) => {
-                    error!("Failed to attach tenant timelines: {e:?}");
-                    tenant.set_state(TenantState::Broken);
+        match local_timelines {
+            TenantAttachData::Broken(e) => warn!("{}", e),
+            TenantAttachData::Ready(ref timelines) => {
+                info!("Attaching {} timelines for {tenant_id}", timelines.len());
+                debug!("Timelines to attach: {local_timelines:?}");
+                let has_timelines = !timelines.is_empty();
+                let timelines_to_attach = timelines
+                    .iter()
+                    .map(|(&k, v)| (k, v.metadata().to_owned()))
+                    .collect();
+                match tenant.init_attach_timelines(timelines_to_attach) {
+                    Ok(()) => {
+                        info!("successfully loaded local timelines for tenant {tenant_id}");
+                        tenant.activate(has_timelines);
+                    }
+                    Err(e) => {
+                        error!("Failed to attach tenant timelines: {e:?}");
+                        tenant.set_state(TenantState::Broken);
+                    }
                 }
             }
         }
@@ -160,32 +208,6 @@ pub fn attach_local_tenants(
     info!("Processed {number_of_tenants} local tenants during attach")
 }

-fn load_local_tenant(
-    conf: &'static PageServerConf,
-    tenant_id: TenantId,
-    remote_index: &RemoteIndex,
-) -> Arc<Tenant> {
-    let tenant = Arc::new(Tenant::new(
-        conf,
-        TenantConfOpt::default(),
-        Arc::new(PostgresRedoManager::new(conf, tenant_id)),
-        tenant_id,
-        remote_index.clone(),
-        conf.remote_storage_config.is_some(),
-    ));
-    match Tenant::load_tenant_config(conf, tenant_id) {
-        Ok(tenant_conf) => {
-            tenant.update_tenant_config(tenant_conf);
-            tenant.activate(false);
-        }
-        Err(e) => {
-            error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}");
-            tenant.set_state(TenantState::Broken);
-        }
-    }
-    tenant
-}
-
 ///
 /// Shut down all tenants. This runs as part of pageserver shutdown.
 ///
@@ -459,16 +481,21 @@ pub fn list_tenant_info(remote_index: &RemoteTimelineIndex) -> Vec<TenantInfo> {
         .collect()
 }

+#[derive(Debug)]
+pub enum TenantAttachData {
+    Ready(HashMap<TimelineId, TimelineLocalFiles>),
+    Broken(anyhow::Error),
+}
 /// Attempts to collect information about all tenant and timelines, existing on the local FS.
 /// If finds any, deletes all temporary files and directories, created before. Also removes empty directories,
 /// that may appear due to such removals.
 /// Does not fail on particular timeline or tenant collection errors, rather logging them and ignoring the entities.
 fn local_tenant_timeline_files(
     config: &'static PageServerConf,
-) -> anyhow::Result<TenantTimelineValues<(TimelineMetadata, HashSet<PathBuf>)>> {
+) -> anyhow::Result<HashMap<TenantId, TenantAttachData>> {
     let _entered = info_span!("local_tenant_timeline_files").entered();

-    let mut local_tenant_timeline_files = TenantTimelineValues::new();
+    let mut local_tenant_timeline_files = HashMap::new();
     let tenants_dir = config.tenants_path();
     for tenants_dir_entry in fs::read_dir(&tenants_dir)
         .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
@@ -490,19 +517,31 @@ fn local_tenant_timeline_files(
             }
         } else {
             match collect_timelines_for_tenant(config, &tenant_dir_path) {
-                Ok((tenant_id, collected_files)) => {
+                Ok((tenant_id, TenantAttachData::Broken(e))) => {
+                    local_tenant_timeline_files.entry(tenant_id).or_insert(TenantAttachData::Broken(e));
+                },
+                Ok((tenant_id, TenantAttachData::Ready(collected_files))) => {
                     if collected_files.is_empty() {
                         match remove_if_empty(&tenant_dir_path) {
                             Ok(true) => info!("Removed empty tenant directory {}", tenant_dir_path.display()),
                             Ok(false) => {
                                 // insert empty timeline entry: it has some non-temporary files inside that we cannot remove
                                 // so make obvious for HTTP API callers, that something exists there and try to load the tenant
-                                let _ = local_tenant_timeline_files.0.entry(tenant_id).or_default();
+                                let _ = local_tenant_timeline_files.entry(tenant_id).or_insert_with(|| TenantAttachData::Ready(HashMap::new()));
                             },
                             Err(e) => error!("Failed to remove empty tenant directory: {e:?}"),
                         }
                     } else {
-                        local_tenant_timeline_files.0.entry(tenant_id).or_default().extend(collected_files.into_iter())
+                        match local_tenant_timeline_files.entry(tenant_id) {
+                            hash_map::Entry::Vacant(entry) => {
+                                entry.insert(TenantAttachData::Ready(collected_files));
+                            }
+                            hash_map::Entry::Occupied(entry) =>{
+                                if let TenantAttachData::Ready(old_timelines) = entry.into_mut() {
+                                    old_timelines.extend(collected_files);
+                                }
+                            },
+                        }
                     }
                 },
                 Err(e) => error!(
@@ -525,7 +564,7 @@ fn local_tenant_timeline_files(

     info!(
         "Collected files for {} tenants",
-        local_tenant_timeline_files.0.len()
+        local_tenant_timeline_files.len(),
     );
     Ok(local_tenant_timeline_files)
 }
@@ -563,14 +602,10 @@ fn is_temporary(path: &Path) -> bool {
     }
 }

-#[allow(clippy::type_complexity)]
 fn collect_timelines_for_tenant(
     config: &'static PageServerConf,
     tenant_path: &Path,
-) -> anyhow::Result<(
-    TenantId,
-    HashMap<TimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
-)> {
+) -> anyhow::Result<(TenantId, TenantAttachData)> {
     let tenant_id = tenant_path
         .file_name()
         .and_then(OsStr::to_str)
@@ -579,6 +614,17 @@ fn collect_timelines_for_tenant(
         .context("Could not parse tenant id out of the tenant dir name")?;
     let timelines_dir = config.timelines_path(&tenant_id);

+    if !timelines_dir.as_path().is_dir() {
+        return Ok((
+            tenant_id,
+            TenantAttachData::Broken(anyhow::anyhow!(
+                "Tenant {} has no timelines directory at {}",
+                tenant_id,
+                timelines_dir.display()
+            )),
+        ));
+    }
+
     let mut tenant_timelines = HashMap::new();
     for timelines_dir_entry in fs::read_dir(&timelines_dir)
         .with_context(|| format!("Failed to list timelines dir entry for tenant {tenant_id}"))?
@@ -601,7 +647,10 @@ fn collect_timelines_for_tenant(
         } else {
             match collect_timeline_files(&timeline_dir) {
                 Ok((timeline_id, metadata, timeline_files)) => {
-                    tenant_timelines.insert(timeline_id, (metadata, timeline_files));
+                    tenant_timelines.insert(
+                        timeline_id,
+                        TimelineLocalFiles::collected(metadata, timeline_files),
+                    );
                 }
                 Err(e) => {
                     error!(
@@ -630,25 +679,25 @@ fn collect_timelines_for_tenant(
     }

     if tenant_timelines.is_empty() {
-        match remove_if_empty(&timelines_dir) {
-            Ok(true) => info!(
-                "Removed empty tenant timelines directory {}",
-                timelines_dir.display()
-            ),
-            Ok(false) => (),
-            Err(e) => error!("Failed to remove empty tenant timelines directory: {e:?}"),
-        }
+        // this is normal, we've removed all broken, empty and temporary timeline dirs
+        // but should allow the tenant to stay functional and allow creating new timelines
+        // on a restart, we require tenants to have the timelines dir, so leave it on disk
+        debug!("Tenant {tenant_id} has no timelines loaded");
     }

-    Ok((tenant_id, tenant_timelines))
+    Ok((tenant_id, TenantAttachData::Ready(tenant_timelines)))
 }

 // discover timeline files and extract timeline metadata
 // NOTE: ephemeral files are excluded from the list
 fn collect_timeline_files(
     timeline_dir: &Path,
-) -> anyhow::Result<(TimelineId, TimelineMetadata, HashSet<PathBuf>)> {
-    let mut timeline_files = HashSet::new();
+) -> anyhow::Result<(
+    TimelineId,
+    TimelineMetadata,
+    HashMap<PathBuf, LayerFileMetadata>,
+)> {
+    let mut timeline_files = HashMap::new();
     let mut timeline_metadata_path = None;

     let timeline_id = timeline_dir
@@ -661,7 +710,9 @@ fn collect_timeline_files(
         fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
     for entry in timeline_dir_entries {
         let entry_path = entry.context("Failed to list timeline dir entry")?.path();
-        if entry_path.is_file() {
+        let metadata = entry_path.metadata()?;
+
+        if metadata.is_file() {
             if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) {
                 timeline_metadata_path = Some(entry_path);
             } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) {
@@ -676,7 +727,8 @@ fn collect_timeline_files(
                 )
             })?;
             } else {
-                timeline_files.insert(entry_path);
+                let layer_metadata = LayerFileMetadata::new(metadata.len());
+                timeline_files.insert(entry_path, layer_metadata);
             }
         }
     }
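The collect_timeline_files changes above switch from remembering bare paths to recording each layer file's size. A stripped-down sketch of that directory scan, with illustrative names and plain u64 sizes in place of LayerFileMetadata:

use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};

fn collect_files(
    dir: &Path,
    metadata_name: &str,
) -> std::io::Result<(Option<PathBuf>, HashMap<PathBuf, u64>)> {
    let mut metadata_path = None;
    let mut files = HashMap::new();
    for entry in fs::read_dir(dir)? {
        let path = entry?.path();
        let meta = path.metadata()?;
        if meta.is_file() {
            if path.file_name().and_then(|n| n.to_str()) == Some(metadata_name) {
                // keep the metadata file separate from the layer files
                metadata_path = Some(path);
            } else {
                // record the file together with its on-disk size
                files.insert(path, meta.len());
            }
        }
    }
    Ok((metadata_path, files))
}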
@@ -70,8 +70,10 @@ async fn compaction_loop(tenant_id: TenantId) {
                 // Run compaction
                 let mut sleep_duration = tenant.get_compaction_period();
                 if let Err(e) = tenant.compaction_iteration() {
-                    error!("Compaction failed, retrying: {e:#}");
                     sleep_duration = wait_duration;
+                    error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration);
+                    #[cfg(feature = "testing")]
+                    std::process::abort();
                 }

                 // Sleep
@@ -119,8 +121,10 @@ async fn gc_loop(tenant_id: TenantId) {
                 if gc_horizon > 0 {
                     if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false)
                     {
-                        error!("Gc failed, retrying: {e:#}");
                         sleep_duration = wait_duration;
+                        error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
+                        #[cfg(feature = "testing")]
+                        std::process::abort();
                     }
                 }

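Both loops above follow the same retry shape: on error, log and fall back to a shorter wait before the next iteration. A generic hedged sketch of that shape, assuming the tokio and anyhow crates; the function and parameter names are invented for the example:

use std::time::Duration;

async fn periodic_task<F>(mut iteration: F, period: Duration, retry_wait: Duration)
where
    F: FnMut() -> anyhow::Result<()>,
{
    loop {
        let sleep_for = match iteration() {
            Ok(()) => period,
            Err(e) => {
                // on failure, retry sooner than the normal period
                eprintln!("iteration failed, retrying in {retry_wait:?}: {e:#}");
                retry_wait
            }
        };
        tokio::time::sleep(sleep_for).await;
    }
}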
@@ -12,6 +12,8 @@ use chrono::{NaiveDateTime, Utc};
 use fail::fail_point;
 use futures::StreamExt;
 use postgres::{SimpleQueryMessage, SimpleQueryRow};
+use postgres_ffi::v14::xlog_utils::normalize_lsn;
+use postgres_ffi::WAL_SEGMENT_SIZE;
 use postgres_protocol::message::backend::ReplicationMessage;
 use postgres_types::PgLsn;
 use tokio::{pin, select, sync::watch, time};
@@ -156,6 +158,14 @@ pub async fn handle_walreceiver_connection(
     // There might be some padding after the last full record, skip it.
     startpoint += startpoint.calc_padding(8u32);

+    // If the starting point is at a WAL page boundary, skip past the page header. We don't need the page headers
+    // for anything, and in some corner cases, the compute node might have never generated the WAL for page headers
+    //. That happens if you create a branch at page boundary: the start point of the branch is at the page boundary,
+    // but when the compute node first starts on the branch, we normalize the first REDO position to just after the page
+    // header (see generate_pg_control()), so the WAL for the page header is never streamed from the compute node
+    // to the safekeepers.
+    startpoint = normalize_lsn(startpoint, WAL_SEGMENT_SIZE);
+
     info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}...");

     let query = format!("START_REPLICATION PHYSICAL {startpoint}");
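The comment block above explains why the replication start point must be moved past a WAL page header when it lands exactly on a page boundary. A hedged sketch of what such a normalization step computes, using the usual PostgreSQL header sizes as assumptions (8192-byte WAL pages, 24-byte short page header, 40-byte long header on the first page of a segment); the real normalize_lsn lives in postgres_ffi and may differ in detail:

const WAL_PAGE_SIZE: u64 = 8192;
const SHORT_PAGE_HEADER: u64 = 24;
const LONG_PAGE_HEADER: u64 = 40;

fn normalize_start_lsn(lsn: u64, segment_size: u64) -> u64 {
    if lsn % WAL_PAGE_SIZE != 0 {
        return lsn; // not on a page boundary, nothing to skip
    }
    if lsn % segment_size == 0 {
        lsn + LONG_PAGE_HEADER // first page of a segment carries the long header
    } else {
        lsn + SHORT_PAGE_HEADER
    }
}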
@@ -39,7 +39,8 @@ use utils::crashsafe_dir::path_with_suffix_extension;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

 use crate::metrics::{
-    WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, WAL_REDO_WAIT_TIME,
+    WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
+    WAL_REDO_WAIT_TIME,
 };
 use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
 use crate::reltag::{RelTag, SlruKind};
@@ -244,12 +245,23 @@ impl PostgresRedoManager {
         let end_time = Instant::now();
         let duration = end_time.duration_since(lock_time);

+        let len = records.len();
+        let nbytes = records.iter().fold(0, |acumulator, record| {
+            acumulator
+                + match &record.1 {
+                    NeonWalRecord::Postgres { rec, .. } => rec.len(),
+                    _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
+                }
+        });
+
         WAL_REDO_TIME.observe(duration.as_secs_f64());
-        WAL_REDO_RECORDS_HISTOGRAM.observe(records.len() as f64);
+        WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
+        WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);

         debug!(
-            "postgres applied {} WAL records in {} us to reconstruct page image at LSN {}",
-            records.len(),
+            "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
+            len,
+            nbytes,
             duration.as_micros(),
             lsn
         );
@@ -258,8 +270,9 @@ impl PostgresRedoManager {
         // next request will launch a new one.
         if result.is_err() {
             error!(
-                "error applying {} WAL records to reconstruct page image at LSN {}",
+                "error applying {} WAL records ({} bytes) to reconstruct page image at LSN {}",
                 records.len(),
+                nbytes,
                 lsn
             );
             let process = process_guard.take().unwrap();
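The byte accounting added above folds over the record batch and sums only PostgreSQL-type records. The same idea over a simplified record type; the enum below is a stand-in for illustration, not the real NeonWalRecord:

enum WalRecord {
    Postgres { rec: Vec<u8> },
    Other,
}

fn total_record_bytes(records: &[(u64, WalRecord)]) -> usize {
    // sum the payload size of Postgres records, ignoring anything else
    records.iter().fold(0, |acc, (_lsn, rec)| {
        acc + match rec {
            WalRecord::Postgres { rec } => rec.len(),
            WalRecord::Other => 0,
        }
    })
}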
@@ -10,51 +10,12 @@ struct WalProposerConn
 	PGconn	   *pg_conn;
 	bool		is_nonblocking; /* whether the connection is non-blocking */
 	char	   *recvbuf;		/* last received data from
-								 * libpqprop_async_read */
+								 * walprop_async_read */
 };

-/* Prototypes for exported functions */
-static char *libpqprop_error_message(WalProposerConn * conn);
-static WalProposerConnStatusType libpqprop_status(WalProposerConn * conn);
-static WalProposerConn * libpqprop_connect_start(char *conninfo);
-static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn * conn);
-static bool libpqprop_send_query(WalProposerConn * conn, char *query);
-static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn * conn);
-static pgsocket libpqprop_socket(WalProposerConn * conn);
-static int	libpqprop_flush(WalProposerConn * conn);
-static void libpqprop_finish(WalProposerConn * conn);
-static PGAsyncReadResult libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount);
-static PGAsyncWriteResult libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size);
-static bool libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size);
-
-static WalProposerFunctionsType PQWalProposerFunctions =
-{
-	libpqprop_error_message,
-	libpqprop_status,
-	libpqprop_connect_start,
-	libpqprop_connect_poll,
-	libpqprop_send_query,
-	libpqprop_get_query_result,
-	libpqprop_socket,
-	libpqprop_flush,
-	libpqprop_finish,
-	libpqprop_async_read,
-	libpqprop_async_write,
-	libpqprop_blocking_write,
-};
-
-/* Module initialization */
-void
-pg_init_libpqwalproposer(void)
-{
-	if (WalProposerFunctions != NULL)
-		elog(ERROR, "libpqwalproposer already loaded");
-	WalProposerFunctions = &PQWalProposerFunctions;
-}
-
 /* Helper function */
 static bool
-ensure_nonblocking_status(WalProposerConn * conn, bool is_nonblocking)
+ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
 {
 	/* If we're already correctly blocking or nonblocking, all good */
 	if (is_nonblocking == conn->is_nonblocking)
@@ -69,14 +30,14 @@ ensure_nonblocking_status(WalProposerConn * conn, bool is_nonblocking)
 }

 /* Exported function definitions */
-static char *
-libpqprop_error_message(WalProposerConn * conn)
+char *
+walprop_error_message(WalProposerConn *conn)
 {
 	return PQerrorMessage(conn->pg_conn);
 }

-static WalProposerConnStatusType
-libpqprop_status(WalProposerConn * conn)
+WalProposerConnStatusType
+walprop_status(WalProposerConn *conn)
 {
 	switch (PQstatus(conn->pg_conn))
 	{
@@ -89,8 +50,8 @@ libpqprop_status(WalProposerConn * conn)
 	}
 }

-static WalProposerConn *
-libpqprop_connect_start(char *conninfo)
+WalProposerConn *
+walprop_connect_start(char *conninfo)
 {
 	WalProposerConn *conn;
 	PGconn	   *pg_conn;
@@ -119,8 +80,8 @@ libpqprop_connect_start(char *conninfo)
 	return conn;
 }

-static WalProposerConnectPollStatusType
-libpqprop_connect_poll(WalProposerConn * conn)
+WalProposerConnectPollStatusType
+walprop_connect_poll(WalProposerConn *conn)
 {
 	WalProposerConnectPollStatusType return_val;

@@ -160,8 +121,8 @@ libpqprop_connect_poll(WalProposerConn * conn)
 	return return_val;
 }

-static bool
-libpqprop_send_query(WalProposerConn * conn, char *query)
+bool
+walprop_send_query(WalProposerConn *conn, char *query)
 {
 	/*
 	 * We need to be in blocking mode for sending the query to run without
@@ -177,8 +138,8 @@ libpqprop_send_query(WalProposerConn * conn, char *query)
 	return true;
 }

-static WalProposerExecStatusType
-libpqprop_get_query_result(WalProposerConn * conn)
+WalProposerExecStatusType
+walprop_get_query_result(WalProposerConn *conn)
 {
 	PGresult   *result;
 	WalProposerExecStatusType return_val;
@@ -255,20 +216,20 @@ libpqprop_get_query_result(WalProposerConn * conn)
 	return return_val;
 }

-static pgsocket
-libpqprop_socket(WalProposerConn * conn)
+pgsocket
+walprop_socket(WalProposerConn *conn)
 {
 	return PQsocket(conn->pg_conn);
 }

-static int
-libpqprop_flush(WalProposerConn * conn)
+int
+walprop_flush(WalProposerConn *conn)
 {
 	return (PQflush(conn->pg_conn));
 }

-static void
-libpqprop_finish(WalProposerConn * conn)
+void
+walprop_finish(WalProposerConn *conn)
 {
 	if (conn->recvbuf != NULL)
 		PQfreemem(conn->recvbuf);
@@ -282,8 +243,8 @@ libpqprop_finish(WalProposerConn * conn)
  * On success, the data is placed in *buf. It is valid until the next call
  * to this function.
  */
-static PGAsyncReadResult
-libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount)
+PGAsyncReadResult
+walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
 {
 	int			result;

@@ -353,8 +314,8 @@ libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount)
 	}
 }

-static PGAsyncWriteResult
-libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size)
+PGAsyncWriteResult
+walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
 {
 	int			result;

@@ -408,8 +369,12 @@ libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size)
 	}
 }

-static bool
-libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size)
+/*
+ * This function is very similar to walprop_async_write. For more
+ * information, refer to the comments there.
+ */
+bool
+walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
 {
 	int			result;

@@ -417,10 +382,6 @@ libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size)
 	if (!ensure_nonblocking_status(conn, false))
 		return false;

-	/*
-	 * Ths function is very similar to libpqprop_async_write. For more
-	 * information, refer to the comments there
-	 */
 	if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
 		return false;

|
|||||||
@@ -32,7 +32,6 @@ void
|
|||||||
_PG_init(void)
|
_PG_init(void)
|
||||||
{
|
{
|
||||||
pg_init_libpagestore();
|
pg_init_libpagestore();
|
||||||
pg_init_libpqwalproposer();
|
|
||||||
pg_init_walproposer();
|
pg_init_walproposer();
|
||||||
|
|
||||||
EmitWarningsOnPlaceholders("neon");
|
EmitWarningsOnPlaceholders("neon");
|
||||||
|
|||||||
@@ -13,7 +13,6 @@
|
|||||||
#define NEON_H
|
#define NEON_H
|
||||||
|
|
||||||
extern void pg_init_libpagestore(void);
|
extern void pg_init_libpagestore(void);
|
||||||
extern void pg_init_libpqwalproposer(void);
|
|
||||||
extern void pg_init_walproposer(void);
|
extern void pg_init_walproposer(void);
|
||||||
|
|
||||||
#endif /* NEON_H */
|
#endif /* NEON_H */
|
||||||
|
|||||||
@@ -79,9 +79,6 @@ bool am_wal_proposer;
|
|||||||
char *neon_timeline_walproposer = NULL;
|
char *neon_timeline_walproposer = NULL;
|
||||||
char *neon_tenant_walproposer = NULL;
|
char *neon_tenant_walproposer = NULL;
|
||||||
|
|
||||||
/* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */
|
|
||||||
WalProposerFunctionsType *WalProposerFunctions = NULL;
|
|
||||||
|
|
||||||
#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
|
#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
|
||||||
|
|
||||||
static int n_safekeepers = 0;
|
static int n_safekeepers = 0;
|
||||||
@@ -438,10 +435,6 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId)
|
|||||||
char *sep;
|
char *sep;
|
||||||
char *port;
|
char *port;
|
||||||
|
|
||||||
/* Load the libpq-specific functions */
|
|
||||||
if (WalProposerFunctions == NULL)
|
|
||||||
elog(ERROR, "libpqwalproposer didn't initialize correctly");
|
|
||||||
|
|
||||||
load_file("libpqwalreceiver", false);
|
load_file("libpqwalreceiver", false);
|
||||||
if (WalReceiverFunctions == NULL)
|
if (WalReceiverFunctions == NULL)
|
||||||
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
|
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
|
||||||
@@ -1471,12 +1464,6 @@ SendProposerElected(Safekeeper *sk)
|
|||||||
*/
|
*/
|
||||||
th = &sk->voteResponse.termHistory;
|
th = &sk->voteResponse.termHistory;
|
||||||
|
|
||||||
/*
|
|
||||||
* If any WAL is present on the sk, it must be authorized by some term.
|
|
||||||
* OTOH, without any WAL there are no term swiches in the log.
|
|
||||||
*/
|
|
||||||
Assert((th->n_entries == 0) ==
|
|
||||||
(sk->voteResponse.flushLsn == InvalidXLogRecPtr));
|
|
||||||
/* We must start somewhere. */
|
/* We must start somewhere. */
|
||||||
Assert(propTermHistory.n_entries >= 1);
|
Assert(propTermHistory.n_entries >= 1);
|
||||||
|
|
||||||
|
|||||||
@@ -446,31 +446,31 @@ typedef enum
 } WalProposerConnStatusType;
 
 /* Re-exported PQerrorMessage */
-typedef char *(*walprop_error_message_fn) (WalProposerConn * conn);
+extern char *walprop_error_message(WalProposerConn *conn);
 
 /* Re-exported PQstatus */
-typedef WalProposerConnStatusType(*walprop_status_fn) (WalProposerConn * conn);
+extern WalProposerConnStatusType walprop_status(WalProposerConn *conn);
 
 /* Re-exported PQconnectStart */
-typedef WalProposerConn * (*walprop_connect_start_fn) (char *conninfo);
+extern WalProposerConn * walprop_connect_start(char *conninfo);
 
 /* Re-exported PQconectPoll */
-typedef WalProposerConnectPollStatusType(*walprop_connect_poll_fn) (WalProposerConn * conn);
+extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn);
 
 /* Blocking wrapper around PQsendQuery */
-typedef bool (*walprop_send_query_fn) (WalProposerConn * conn, char *query);
+extern bool walprop_send_query(WalProposerConn *conn, char *query);
 
 /* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */
-typedef WalProposerExecStatusType(*walprop_get_query_result_fn) (WalProposerConn * conn);
+extern WalProposerExecStatusType walprop_get_query_result(WalProposerConn *conn);
 
 /* Re-exported PQsocket */
-typedef pgsocket (*walprop_socket_fn) (WalProposerConn * conn);
+extern pgsocket walprop_socket(WalProposerConn *conn);
 
 /* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */
-typedef int (*walprop_flush_fn) (WalProposerConn * conn);
+extern int walprop_flush(WalProposerConn *conn);
 
 /* Re-exported PQfinish */
-typedef void (*walprop_finish_fn) (WalProposerConn * conn);
+extern void walprop_finish(WalProposerConn *conn);
 
 /*
  * Ergonomic wrapper around PGgetCopyData
@@ -486,9 +486,7 @@ typedef void (*walprop_finish_fn) (WalProposerConn * conn);
 * performs a bit of extra checking work that's always required and is normally
 * somewhat verbose.
 */
-typedef PGAsyncReadResult(*walprop_async_read_fn) (WalProposerConn * conn,
-												   char **buf,
-												   int *amount);
+extern PGAsyncReadResult walprop_async_read(WalProposerConn *conn, char **buf, int *amount);
 
 /*
 * Ergonomic wrapper around PQputCopyData + PQflush
@@ -497,69 +495,14 @@ typedef PGAsyncReadResult(*walprop_async_read_fn) (WalProposerConn * conn,
 *
 * For information on the meaning of return codes, refer to PGAsyncWriteResult.
 */
-typedef PGAsyncWriteResult(*walprop_async_write_fn) (WalProposerConn * conn,
-													 void const *buf,
-													 size_t size);
+extern PGAsyncWriteResult walprop_async_write(WalProposerConn *conn, void const *buf, size_t size);
 
 /*
 * Blocking equivalent to walprop_async_write_fn
 *
 * Returns 'true' if successful, 'false' on failure.
 */
-typedef bool (*walprop_blocking_write_fn) (WalProposerConn * conn, void const *buf, size_t size);
+extern bool walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size);
 
-/* All libpqwalproposer exported functions collected together. */
-typedef struct WalProposerFunctionsType
-{
-	walprop_error_message_fn walprop_error_message;
-	walprop_status_fn walprop_status;
-	walprop_connect_start_fn walprop_connect_start;
-	walprop_connect_poll_fn walprop_connect_poll;
-	walprop_send_query_fn walprop_send_query;
-	walprop_get_query_result_fn walprop_get_query_result;
-	walprop_socket_fn walprop_socket;
-	walprop_flush_fn walprop_flush;
-	walprop_finish_fn walprop_finish;
-	walprop_async_read_fn walprop_async_read;
-	walprop_async_write_fn walprop_async_write;
-	walprop_blocking_write_fn walprop_blocking_write;
-} WalProposerFunctionsType;
-
-/* Allow the above functions to be "called" with normal syntax */
-#define walprop_error_message(conn) \
-	WalProposerFunctions->walprop_error_message(conn)
-#define walprop_status(conn) \
-	WalProposerFunctions->walprop_status(conn)
-#define walprop_connect_start(conninfo) \
-	WalProposerFunctions->walprop_connect_start(conninfo)
-#define walprop_connect_poll(conn) \
-	WalProposerFunctions->walprop_connect_poll(conn)
-#define walprop_send_query(conn, query) \
-	WalProposerFunctions->walprop_send_query(conn, query)
-#define walprop_get_query_result(conn) \
-	WalProposerFunctions->walprop_get_query_result(conn)
-#define walprop_set_nonblocking(conn, arg) \
-	WalProposerFunctions->walprop_set_nonblocking(conn, arg)
-#define walprop_socket(conn) \
-	WalProposerFunctions->walprop_socket(conn)
-#define walprop_flush(conn) \
-	WalProposerFunctions->walprop_flush(conn)
-#define walprop_finish(conn) \
-	WalProposerFunctions->walprop_finish(conn)
-#define walprop_async_read(conn, buf, amount) \
-	WalProposerFunctions->walprop_async_read(conn, buf, amount)
-#define walprop_async_write(conn, buf, size) \
-	WalProposerFunctions->walprop_async_write(conn, buf, size)
-#define walprop_blocking_write(conn, buf, size) \
-	WalProposerFunctions->walprop_blocking_write(conn, buf, size)
-
-/*
- * The runtime location of the libpqwalproposer functions.
- *
- * This pointer is set by the initializer in libpqwalproposer, so that we
- * can use it later.
- */
-extern PGDLLIMPORT WalProposerFunctionsType * WalProposerFunctions;
-
 extern uint64 BackpressureThrottlingTime(void);
@@ -5,11 +5,11 @@ edition = "2021"
 
 [dependencies]
 anyhow = "1.0"
-async-trait = "0.1"
+atty = "0.2.14"
 base64 = "0.13.0"
-bstr = "0.2.17"
+bstr = "1.0"
 bytes = { version = "1.0.1", features = ['serde'] }
-clap = "3.0"
+clap = "4.0"
 futures = "0.3.13"
 git-version = "0.3.5"
 hashbrown = "0.12"
@@ -22,7 +22,11 @@ once_cell = "1.13.0"
 parking_lot = "0.12"
 pin-project-lite = "0.2.7"
 rand = "0.8.3"
-reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
+reqwest = { version = "0.11", default-features = false, features = [
+    "blocking",
+    "json",
+    "rustls-tls",
+] }
 routerify = "3"
 rustls = "0.20.0"
 rustls-pemfile = "1"
@@ -33,17 +37,20 @@ sha2 = "0.10.2"
 socket2 = "0.4.4"
 thiserror = "1.0.30"
 tokio = { version = "1.17", features = ["macros"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 tokio-rustls = "0.23.0"
+tracing = "0.1.36"
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 url = "2.2.2"
-uuid = { version = "0.8.2", features = ["v4", "serde"]}
-x509-parser = "0.13.2"
+uuid = { version = "1.2", features = ["v4", "serde"] }
+x509-parser = "0.14"
 
 utils = { path = "../libs/utils" }
 metrics = { path = "../libs/metrics" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
 
 [dev-dependencies]
-rcgen = "0.8.14"
-rstest = "0.12"
+async-trait = "0.1"
+rcgen = "0.10"
+rstest = "0.15"
 tokio-postgres-rustls = "0.9.0"
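The new `tracing` and `tracing-subscriber` dependencies (with the `env-filter` feature) are normally activated once at process start so that `RUST_LOG` controls verbosity. The snippet below is a minimal sketch of such an initialization under that assumption; it is not the proxy's actual logging setup code.

// Minimal sketch (assumption, not the proxy's real setup): initialize a
// tracing subscriber whose filter is read from RUST_LOG via the
// "env-filter" feature of tracing-subscriber.
use tracing::info;
use tracing_subscriber::EnvFilter;

fn main() {
    tracing_subscriber::fmt()
        .with_env_filter(EnvFilter::from_default_env())
        .init();

    // Fields become structured key=value pairs on the emitted event.
    info!(component = "proxy", "logging initialized");
}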
@@ -15,6 +15,7 @@ use once_cell::sync::Lazy;
 use serde::{Deserialize, Serialize};
 use std::borrow::Cow;
 use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::{info, warn};
 
 static CPLANE_WAITERS: Lazy<Waiters<mgmt::ComputeReady>> = Lazy::new(Default::default);
 
@@ -171,6 +172,8 @@ impl BackendType<'_, ClientCredentials<'_>> {
             // support SNI or other means of passing the project name.
             // We now expect to see a very specific payload in the place of password.
             if creds.project().is_none() {
+                warn!("project name not specified, resorting to the password hack auth flow");
+
                 let payload = AuthFlow::new(client)
                     .begin(auth::PasswordHack)
                     .await?
@@ -179,6 +182,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
 
             // Finally we may finish the initialization of `creds`.
             // TODO: add missing type safety to ClientCredentials.
+            info!(project = &payload.project, "received missing parameter");
            creds.project = Some(payload.project.into());
 
             let mut config = match &self {
@@ -196,6 +200,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
             // We should use a password from payload as well.
             config.password(payload.password);
 
+            info!("user successfully authenticated (using the password hack)");
             return Ok(compute::NodeInfo {
                 reported_auth_ok: false,
                 config,
@@ -203,19 +208,31 @@ impl BackendType<'_, ClientCredentials<'_>> {
             }
         }
 
-        match self {
+        let res = match self {
             Console(endpoint, creds) => {
+                info!(
+                    user = creds.user,
+                    project = creds.project(),
+                    "performing authentication using the console"
+                );
                 console::Api::new(&endpoint, extra, &creds)
                     .handle_user(client)
                     .await
             }
             Postgres(endpoint, creds) => {
+                info!("performing mock authentication using a local postgres instance");
                 postgres::Api::new(&endpoint, &creds)
                     .handle_user(client)
                     .await
             }
             // NOTE: this auth backend doesn't use client credentials.
-            Link(url) => link::handle_user(&url, client).await,
-        }
+            Link(url) => {
+                info!("performing link authentication");
+                link::handle_user(&url, client).await
+            }
+        }?;
+
+        info!("user successfully authenticated");
+        Ok(res)
     }
 }
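In the `match` restructuring above, every backend arm's result is bound to `res` and `?` is applied to the whole expression, so a single success log line runs no matter which arm was taken. The following is a stripped-down sketch of that control-flow shape only; `Backend`, `connect_console`, and `connect_local` are hypothetical stand-ins, not the proxy's real types or functions.

// Sketch of the "bind the match, then log once on success" pattern.
use tracing::info;

enum Backend {
    Console,
    Local,
}

fn connect_console() -> Result<String, std::io::Error> {
    Ok("console-node".to_owned())
}

fn connect_local() -> Result<String, std::io::Error> {
    Ok("local-node".to_owned())
}

fn authenticate(backend: Backend) -> Result<String, std::io::Error> {
    // Each arm may fail; applying `?` to the whole match propagates the
    // error from whichever arm ran.
    let res = match backend {
        Backend::Console => {
            info!("authenticating via the console backend");
            connect_console()
        }
        Backend::Local => {
            info!("authenticating via the local backend");
            connect_local()
        }
    }?;

    // Runs only if the selected branch succeeded.
    info!("user successfully authenticated");
    Ok(res)
}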
@@ -8,35 +8,20 @@ use crate::{
     http, scram,
     stream::PqStream,
 };
+use futures::TryFutureExt;
 use serde::{Deserialize, Serialize};
 use std::future::Future;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::{error, info, info_span};
 
 const REQUEST_FAILED: &str = "Console request failed";
 
 #[derive(Debug, Error)]
-pub enum TransportError {
-    #[error("Console responded with a malformed JSON: {0}")]
-    BadResponse(#[from] serde_json::Error),
-
-    /// HTTP status (other than 200) returned by the console.
-    #[error("Console responded with an HTTP status: {0}")]
-    HttpStatus(reqwest::StatusCode),
-
-    #[error(transparent)]
-    Io(#[from] std::io::Error),
-}
+#[error("{}", REQUEST_FAILED)]
+pub struct TransportError(#[from] std::io::Error);
 
-impl UserFacingError for TransportError {
-    fn to_string_client(&self) -> String {
-        use TransportError::*;
-        match self {
-            HttpStatus(_) => self.to_string(),
-            _ => REQUEST_FAILED.to_owned(),
-        }
-    }
-}
+impl UserFacingError for TransportError {}
 
 // Helps eliminate graceless `.map_err` calls without introducing another ctor.
 impl From<reqwest::Error> for TransportError {
@@ -148,10 +133,11 @@ impl<'a> Api<'a> {
     }
 
     async fn get_auth_info(&self) -> Result<AuthInfo, GetAuthInfoError> {
+        let request_id = uuid::Uuid::new_v4().to_string();
         let req = self
             .endpoint
             .get("proxy_get_role_secret")
-            .header("X-Request-ID", uuid::Uuid::new_v4().to_string())
+            .header("X-Request-ID", &request_id)
             .query(&[("session_id", self.extra.session_id)])
             .query(&[
                 ("application_name", self.extra.application_name),
@@ -160,27 +146,30 @@ impl<'a> Api<'a> {
             ])
             .build()?;
 
-        // TODO: use a proper logger
-        println!("cplane request: {}", req.url());
-
-        let resp = self.endpoint.execute(req).await?;
-        if !resp.status().is_success() {
-            return Err(TransportError::HttpStatus(resp.status()).into());
-        }
-
-        let response: GetRoleSecretResponse = serde_json::from_str(&resp.text().await?)?;
-
-        scram::ServerSecret::parse(&response.role_secret)
+        let span = info_span!("http", id = request_id, url = req.url().as_str());
+        info!(parent: &span, "request auth info");
+        let msg = self
+            .endpoint
+            .checked_execute(req)
+            .and_then(|r| r.json::<GetRoleSecretResponse>())
+            .await
+            .map_err(|e| {
+                error!(parent: &span, "{e}");
+                e
+            })?;
+
+        scram::ServerSecret::parse(&msg.role_secret)
             .map(AuthInfo::Scram)
             .ok_or(GetAuthInfoError::BadSecret)
     }
 
     /// Wake up the compute node and return the corresponding connection info.
     pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg, WakeComputeError> {
+        let request_id = uuid::Uuid::new_v4().to_string();
         let req = self
             .endpoint
             .get("proxy_wake_compute")
-            .header("X-Request-ID", uuid::Uuid::new_v4().to_string())
+            .header("X-Request-ID", &request_id)
             .query(&[("session_id", self.extra.session_id)])
             .query(&[
                 ("application_name", self.extra.application_name),
@@ -188,19 +177,21 @@ impl<'a> Api<'a> {
             ])
             .build()?;
 
-        // TODO: use a proper logger
-        println!("cplane request: {}", req.url());
-
-        let resp = self.endpoint.execute(req).await?;
-        if !resp.status().is_success() {
-            return Err(TransportError::HttpStatus(resp.status()).into());
-        }
-
-        let response: GetWakeComputeResponse = serde_json::from_str(&resp.text().await?)?;
+        let span = info_span!("http", id = request_id, url = req.url().as_str());
+        info!(parent: &span, "request wake-up");
+        let msg = self
+            .endpoint
+            .checked_execute(req)
+            .and_then(|r| r.json::<GetWakeComputeResponse>())
+            .await
+            .map_err(|e| {
+                error!(parent: &span, "{e}");
+                e
+            })?;
 
         // Unfortunately, ownership won't let us use `Option::ok_or` here.
-        let (host, port) = match parse_host_port(&response.address) {
-            None => return Err(WakeComputeError::BadComputeAddress(response.address)),
+        let (host, port) = match parse_host_port(&msg.address) {
+            None => return Err(WakeComputeError::BadComputeAddress(msg.address)),
             Some(x) => x,
         };
 
@@ -227,15 +218,18 @@ where
     GetAuthInfo: Future<Output = Result<AuthInfo, GetAuthInfoError>>,
     WakeCompute: Future<Output = Result<ComputeConnCfg, WakeComputeError>>,
 {
+    info!("fetching user's authentication info");
     let auth_info = get_auth_info(endpoint).await?;
 
     let flow = AuthFlow::new(client);
     let scram_keys = match auth_info {
         AuthInfo::Md5(_) => {
             // TODO: decide if we should support MD5 in api v2
+            info!("auth endpoint chooses MD5");
             return Err(auth::AuthError::bad_auth_method("MD5"));
         }
         AuthInfo::Scram(secret) => {
+            info!("auth endpoint chooses SCRAM");
             let scram = auth::Scram(&secret);
             Some(compute::ScramKeys {
                 client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(),
Some files were not shown because too many files have changed in this diff.
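The console-client changes above replace ad-hoc `println!` logging with a per-request `info_span!` keyed by a generated request id, and log failures through `.map_err` before propagating them. The sketch below reproduces only that span-plus-error-logging shape; `fetch_secret` is a hypothetical placeholder for the real `checked_execute`-based HTTP call and is not part of the proxy crate.

// Sketch of the per-request tracing pattern used above. `fetch_secret` is a
// hypothetical stand-in for the real HTTP request; only the span / map_err
// shape mirrors the diff.
use tracing::{error, info, info_span};
use uuid::Uuid;

async fn fetch_secret(url: &str) -> Result<String, std::io::Error> {
    let _ = url;
    Ok("secret".to_owned())
}

async fn get_role_secret(url: &str) -> Result<String, std::io::Error> {
    // One id per request, attached to every event logged under its span.
    let request_id = Uuid::new_v4().to_string();
    let span = info_span!("http", id = request_id.as_str());
    info!(parent: &span, "request auth info");

    fetch_secret(url).await.map_err(|e| {
        // Record the failure inside the request's span, then propagate it.
        error!(parent: &span, "{e}");
        e
    })
}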