mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-19 11:22:56 +00:00
Compare commits
53 Commits
layer_map_
...
page_cache
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
47d29613a7 | ||
|
|
bc5ec43056 | ||
|
|
b237feedab | ||
|
|
4d1e48f3b9 | ||
|
|
7576b18b14 | ||
|
|
6b49b370fc | ||
|
|
91411c415a | ||
|
|
c67cf34040 | ||
|
|
8fbe437768 | ||
|
|
989d78aac8 | ||
|
|
7ca72578f9 | ||
|
|
41550ec8bf | ||
|
|
0cd2d91b9d | ||
|
|
546e9bdbec | ||
|
|
59bc7e67e0 | ||
|
|
2418e72649 | ||
|
|
80746b1c7a | ||
|
|
129f7c82b7 | ||
|
|
0ec5ddea0b | ||
|
|
c4ee62d427 | ||
|
|
c709354579 | ||
|
|
5d6553d41d | ||
|
|
f03b7c3458 | ||
|
|
9c24de254f | ||
|
|
538876650a | ||
|
|
500239176c | ||
|
|
ee64a6b80b | ||
|
|
a13b486943 | ||
|
|
9fe4548e13 | ||
|
|
14c623b254 | ||
|
|
ebf54b0de0 | ||
|
|
09dda35dac | ||
|
|
6ace79345d | ||
|
|
771e61425e | ||
|
|
93775f6ca7 | ||
|
|
6d0dacc4ce | ||
|
|
e5e40a31f4 | ||
|
|
676c63c329 | ||
|
|
47366522a8 | ||
|
|
db26bc49cc | ||
|
|
e520293090 | ||
|
|
241e549757 | ||
|
|
34bea270f0 | ||
|
|
13f0e7a5b4 | ||
|
|
3e35f10adc | ||
|
|
3be3bb7730 | ||
|
|
01d2c52c82 | ||
|
|
9f79e7edea | ||
|
|
a22165d41e | ||
|
|
725be60bb7 | ||
|
|
e516c376d6 | ||
|
|
8e51c27e1a | ||
|
|
9e1eb69d55 |
2
.github/PULL_REQUEST_TEMPLATE/release-pr.md
vendored
2
.github/PULL_REQUEST_TEMPLATE/release-pr.md
vendored
@@ -10,7 +10,7 @@
|
||||
<!-- List everything that should be done **before** release, any issues / setting changes / etc -->
|
||||
|
||||
### Checklist after release
|
||||
- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/120/files))
|
||||
- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/219/files))
|
||||
- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
|
||||
- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
|
||||
- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)
|
||||
|
||||
4
.github/actions/allure-report/action.yml
vendored
4
.github/actions/allure-report/action.yml
vendored
@@ -47,7 +47,7 @@ runs:
|
||||
else
|
||||
key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -c "[:alnum:]._-" "-")
|
||||
fi
|
||||
echo "::set-output name=KEY::${key}"
|
||||
echo "KEY=${key}" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: actions/setup-java@v3
|
||||
if: ${{ inputs.action == 'generate' }}
|
||||
@@ -186,7 +186,7 @@ runs:
|
||||
aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
|
||||
|
||||
echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
|
||||
echo "::set-output name=report-url::${REPORT_URL}"
|
||||
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Release Allure lock
|
||||
if: ${{ inputs.action == 'generate' && always() }}
|
||||
|
||||
4
.github/actions/download/action.yml
vendored
4
.github/actions/download/action.yml
vendored
@@ -34,7 +34,7 @@ runs:
|
||||
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
|
||||
if [ -z "${S3_KEY}" ]; then
|
||||
if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
|
||||
echo '::set-output name=SKIPPED::true'
|
||||
echo 'SKIPPED=true' >> $GITHUB_OUTPUT
|
||||
exit 0
|
||||
else
|
||||
echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
|
||||
@@ -42,7 +42,7 @@ runs:
|
||||
fi
|
||||
fi
|
||||
|
||||
echo '::set-output name=SKIPPED::false'
|
||||
echo 'SKIPPED=false' >> $GITHUB_OUTPUT
|
||||
|
||||
mkdir -p $(dirname $ARCHIVE)
|
||||
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} ${ARCHIVE}
|
||||
|
||||
@@ -41,8 +41,8 @@ runs:
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "::set-output name=api_host::${API_HOST}"
|
||||
echo "::set-output name=region_id::${REGION_ID}"
|
||||
echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT
|
||||
echo "region_id=${REGION_ID}" >> $GITHUB_OUTPUT
|
||||
env:
|
||||
ENVIRONMENT: ${{ inputs.environment }}
|
||||
REGION_ID: ${{ inputs.region_id }}
|
||||
@@ -72,10 +72,10 @@ runs:
|
||||
|
||||
dsn=$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .dsn')/main
|
||||
echo "::add-mask::${dsn}"
|
||||
echo "::set-output name=dsn::${dsn}"
|
||||
echo "dsn=${dsn}" >> $GITHUB_OUTPUT
|
||||
|
||||
project_id=$(echo $project | jq --raw-output '.id')
|
||||
echo "::set-output name=project_id::${project_id}"
|
||||
echo "project_id=${project_id}" >> $GITHUB_OUTPUT
|
||||
env:
|
||||
API_KEY: ${{ inputs.api_key }}
|
||||
API_HOST: ${{ steps.parse-input.outputs.api_host }}
|
||||
|
||||
@@ -32,7 +32,7 @@ runs:
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "::set-output name=api_host::${API_HOST}"
|
||||
echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT
|
||||
env:
|
||||
ENVIRONMENT: ${{ inputs.environment }}
|
||||
|
||||
|
||||
3
.github/ansible/.gitignore
vendored
3
.github/ansible/.gitignore
vendored
@@ -2,3 +2,6 @@ zenith_install.tar.gz
|
||||
.zenith_current_version
|
||||
neon_install.tar.gz
|
||||
.neon_current_version
|
||||
|
||||
collections/*
|
||||
!collections/.keep
|
||||
|
||||
1
.github/ansible/ansible.cfg
vendored
1
.github/ansible/ansible.cfg
vendored
@@ -3,6 +3,7 @@
|
||||
localhost_warning = False
|
||||
host_key_checking = False
|
||||
timeout = 30
|
||||
collections_paths = ./collections
|
||||
|
||||
[ssh_connection]
|
||||
ssh_args = -F ./ansible.ssh.cfg
|
||||
|
||||
0
.github/ansible/collections/.keep
vendored
Normal file
0
.github/ansible/collections/.keep
vendored
Normal file
41
.github/ansible/deploy.yaml
vendored
41
.github/ansible/deploy.yaml
vendored
@@ -1,7 +1,7 @@
|
||||
- name: Upload Neon binaries
|
||||
hosts: storage
|
||||
gather_facts: False
|
||||
remote_user: admin
|
||||
remote_user: "{{ remote_user }}"
|
||||
|
||||
tasks:
|
||||
|
||||
@@ -14,7 +14,8 @@
|
||||
- safekeeper
|
||||
|
||||
- name: inform about versions
|
||||
debug: msg="Version to deploy - {{ current_version }}"
|
||||
debug:
|
||||
msg: "Version to deploy - {{ current_version }}"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
@@ -35,7 +36,7 @@
|
||||
- name: Deploy pageserver
|
||||
hosts: pageservers
|
||||
gather_facts: False
|
||||
remote_user: admin
|
||||
remote_user: "{{ remote_user }}"
|
||||
|
||||
tasks:
|
||||
|
||||
@@ -63,15 +64,29 @@
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: update remote storage (s3) config
|
||||
lineinfile:
|
||||
path: /storage/pageserver/data/pageserver.toml
|
||||
line: "{{ item }}"
|
||||
loop:
|
||||
- "[remote_storage]"
|
||||
- "bucket_name = '{{ bucket_name }}'"
|
||||
- "bucket_region = '{{ bucket_region }}'"
|
||||
- "prefix_in_bucket = '{{ inventory_hostname }}'"
|
||||
- name: read the existing remote pageserver config
|
||||
ansible.builtin.slurp:
|
||||
src: /storage/pageserver/data/pageserver.toml
|
||||
register: _remote_ps_config
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: parse the existing pageserver configuration
|
||||
ansible.builtin.set_fact:
|
||||
_existing_ps_config: "{{ _remote_ps_config['content'] | b64decode | sivel.toiletwater.from_toml }}"
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: construct the final pageserver configuration dict
|
||||
ansible.builtin.set_fact:
|
||||
pageserver_config: "{{ pageserver_config_stub | combine({'id': _existing_ps_config.id }) }}"
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: template the pageserver config
|
||||
template:
|
||||
src: templates/pageserver.toml.j2
|
||||
dest: /storage/pageserver/data/pageserver.toml
|
||||
become: true
|
||||
tags:
|
||||
- pageserver
|
||||
@@ -109,7 +124,7 @@
|
||||
- name: Deploy safekeeper
|
||||
hosts: safekeepers
|
||||
gather_facts: False
|
||||
remote_user: admin
|
||||
remote_user: "{{ remote_user }}"
|
||||
|
||||
tasks:
|
||||
|
||||
|
||||
1
.github/ansible/get_binaries.sh
vendored
1
.github/ansible/get_binaries.sh
vendored
@@ -23,6 +23,7 @@ docker cp ${ID}:/data/postgres_install.tar.gz .
|
||||
tar -xzf postgres_install.tar.gz -C neon_install
|
||||
mkdir neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/pageserver_binutils neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/
|
||||
|
||||
20
.github/ansible/neon-stress.hosts
vendored
20
.github/ansible/neon-stress.hosts
vendored
@@ -1,20 +0,0 @@
|
||||
[pageservers]
|
||||
neon-stress-ps-1 console_region_id=1
|
||||
neon-stress-ps-2 console_region_id=1
|
||||
|
||||
[safekeepers]
|
||||
neon-stress-sk-1 console_region_id=1
|
||||
neon-stress-sk-2 console_region_id=1
|
||||
neon-stress-sk-3 console_region_id=1
|
||||
|
||||
[storage:children]
|
||||
pageservers
|
||||
safekeepers
|
||||
|
||||
[storage:vars]
|
||||
env_name = neon-stress
|
||||
console_mgmt_base_url = http://neon-stress-console.local
|
||||
bucket_name = neon-storage-ireland
|
||||
bucket_region = eu-west-1
|
||||
etcd_endpoints = etcd-stress.local:2379
|
||||
safekeeper_enable_s3_offload = false
|
||||
31
.github/ansible/neon-stress.hosts.yaml
vendored
Normal file
31
.github/ansible/neon-stress.hosts.yaml
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-storage-ireland
|
||||
bucket_region: eu-west-1
|
||||
console_mgmt_base_url: http://neon-stress-console.local
|
||||
env_name: neon-stress
|
||||
etcd_endpoints: neon-stress-etcd.local:2379
|
||||
safekeeper_enable_s3_offload: 'false'
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "{{ inventory_hostname }}"
|
||||
hostname_suffix: ".local"
|
||||
remote_user: admin
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
neon-stress-ps-1:
|
||||
console_region_id: aws-eu-west-1
|
||||
neon-stress-ps-2:
|
||||
console_region_id: aws-eu-west-1
|
||||
safekeepers:
|
||||
hosts:
|
||||
neon-stress-sk-1:
|
||||
console_region_id: aws-eu-west-1
|
||||
neon-stress-sk-2:
|
||||
console_region_id: aws-eu-west-1
|
||||
neon-stress-sk-3:
|
||||
console_region_id: aws-eu-west-1
|
||||
20
.github/ansible/production.hosts
vendored
20
.github/ansible/production.hosts
vendored
@@ -1,20 +0,0 @@
|
||||
[pageservers]
|
||||
#zenith-1-ps-1 console_region_id=1
|
||||
zenith-1-ps-2 console_region_id=1
|
||||
zenith-1-ps-3 console_region_id=1
|
||||
|
||||
[safekeepers]
|
||||
zenith-1-sk-1 console_region_id=1
|
||||
zenith-1-sk-2 console_region_id=1
|
||||
zenith-1-sk-3 console_region_id=1
|
||||
|
||||
[storage:children]
|
||||
pageservers
|
||||
safekeepers
|
||||
|
||||
[storage:vars]
|
||||
env_name = prod-1
|
||||
console_mgmt_base_url = http://console-release.local
|
||||
bucket_name = zenith-storage-oregon
|
||||
bucket_region = us-west-2
|
||||
etcd_endpoints = zenith-1-etcd.local:2379
|
||||
33
.github/ansible/production.hosts.yaml
vendored
Normal file
33
.github/ansible/production.hosts.yaml
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
---
|
||||
storage:
|
||||
vars:
|
||||
env_name: prod-1
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
bucket_name: zenith-storage-oregon
|
||||
bucket_region: us-west-2
|
||||
etcd_endpoints: zenith-1-etcd.local:2379
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "{{ inventory_hostname }}"
|
||||
hostname_suffix: ".local"
|
||||
remote_user: admin
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
zenith-1-ps-2:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-ps-3:
|
||||
console_region_id: aws-us-west-2
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
zenith-1-sk-1:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-sk-2:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-sk-3:
|
||||
console_region_id: aws-us-west-2
|
||||
9
.github/ansible/scripts/init_pageserver.sh
vendored
9
.github/ansible/scripts/init_pageserver.sh
vendored
@@ -12,18 +12,19 @@ cat <<EOF | tee /tmp/payload
|
||||
"version": 1,
|
||||
"host": "${HOST}",
|
||||
"port": 6400,
|
||||
"region_id": {{ console_region_id }},
|
||||
"region_id": "{{ console_region_id }}",
|
||||
"instance_id": "${INSTANCE_ID}",
|
||||
"http_host": "${HOST}",
|
||||
"http_port": 9898
|
||||
"http_port": 9898,
|
||||
"active": false
|
||||
}
|
||||
EOF
|
||||
|
||||
# check if pageserver already registered or not
|
||||
if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/pageservers/${INSTANCE_ID} -o /dev/null; then
|
||||
if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/${INSTANCE_ID} -o /dev/null; then
|
||||
|
||||
# not registered, so register it now
|
||||
ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/pageservers -d@/tmp/payload | jq -r '.ID')
|
||||
ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers -d@/tmp/payload | jq -r '.id')
|
||||
|
||||
# init pageserver
|
||||
sudo -u pageserver /usr/local/bin/pageserver -c "id=${ID}" -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
|
||||
|
||||
10
.github/ansible/scripts/init_safekeeper.sh
vendored
10
.github/ansible/scripts/init_safekeeper.sh
vendored
@@ -14,18 +14,18 @@ cat <<EOF | tee /tmp/payload
|
||||
"host": "${HOST}",
|
||||
"port": 6500,
|
||||
"http_port": 7676,
|
||||
"region_id": {{ console_region_id }},
|
||||
"region_id": "{{ console_region_id }}",
|
||||
"instance_id": "${INSTANCE_ID}",
|
||||
"availability_zone_id": "${AZ_ID}"
|
||||
"availability_zone_id": "${AZ_ID}",
|
||||
"active": false
|
||||
}
|
||||
EOF
|
||||
|
||||
# check if safekeeper already registered or not
|
||||
if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/safekeepers/${INSTANCE_ID} -o /dev/null; then
|
||||
if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/${INSTANCE_ID} -o /dev/null; then
|
||||
|
||||
# not registered, so register it now
|
||||
ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers -d@/tmp/payload | jq -r '.ID')
|
||||
|
||||
ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers -d@/tmp/payload | jq -r '.id')
|
||||
# init safekeeper
|
||||
sudo -u safekeeper /usr/local/bin/safekeeper --id ${ID} --init -D /storage/safekeeper/data
|
||||
fi
|
||||
|
||||
3
.github/ansible/ssm_config
vendored
Normal file
3
.github/ansible/ssm_config
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
ansible_connection: aws_ssm
|
||||
ansible_aws_ssm_bucket_name: neon-dev-bucket
|
||||
ansible_python_interpreter: /usr/bin/python3
|
||||
25
.github/ansible/staging.hosts
vendored
25
.github/ansible/staging.hosts
vendored
@@ -1,25 +0,0 @@
|
||||
[pageservers]
|
||||
#zenith-us-stage-ps-1 console_region_id=27
|
||||
zenith-us-stage-ps-2 console_region_id=27
|
||||
zenith-us-stage-ps-3 console_region_id=27
|
||||
zenith-us-stage-ps-4 console_region_id=27
|
||||
zenith-us-stage-test-ps-1 console_region_id=28
|
||||
|
||||
[safekeepers]
|
||||
zenith-us-stage-sk-4 console_region_id=27
|
||||
zenith-us-stage-sk-5 console_region_id=27
|
||||
zenith-us-stage-sk-6 console_region_id=27
|
||||
zenith-us-stage-test-sk-1 console_region_id=28
|
||||
zenith-us-stage-test-sk-2 console_region_id=28
|
||||
zenith-us-stage-test-sk-3 console_region_id=28
|
||||
|
||||
[storage:children]
|
||||
pageservers
|
||||
safekeepers
|
||||
|
||||
[storage:vars]
|
||||
env_name = us-stage
|
||||
console_mgmt_base_url = http://console-staging.local
|
||||
bucket_name = zenith-staging-storage-us-east-1
|
||||
bucket_region = us-east-1
|
||||
etcd_endpoints = zenith-us-stage-etcd.local:2379
|
||||
34
.github/ansible/staging.hosts.yaml
vendored
Normal file
34
.github/ansible/staging.hosts.yaml
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: zenith-staging-storage-us-east-1
|
||||
bucket_region: us-east-1
|
||||
console_mgmt_base_url: http://console-staging.local
|
||||
env_name: us-stage
|
||||
etcd_endpoints: zenith-us-stage-etcd.local:2379
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "{{ inventory_hostname }}"
|
||||
hostname_suffix: ".local"
|
||||
remote_user: admin
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
zenith-us-stage-ps-2:
|
||||
console_region_id: aws-us-east-1
|
||||
zenith-us-stage-ps-3:
|
||||
console_region_id: aws-us-east-1
|
||||
zenith-us-stage-ps-4:
|
||||
console_region_id: aws-us-east-1
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
zenith-us-stage-sk-4:
|
||||
console_region_id: aws-us-east-1
|
||||
zenith-us-stage-sk-5:
|
||||
console_region_id: aws-us-east-1
|
||||
zenith-us-stage-sk-6:
|
||||
console_region_id: aws-us-east-1
|
||||
32
.github/ansible/staging.us-east-2.hosts.yaml
vendored
Normal file
32
.github/ansible/staging.us-east-2.hosts.yaml
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-staging-storage-us-east-2
|
||||
bucket_region: us-east-2
|
||||
console_mgmt_base_url: http://console-staging.local
|
||||
env_name: us-stage
|
||||
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: us-east-2
|
||||
console_region_id: aws-us-east-2
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0c3e70929edb5d691
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.us-east-2.aws.neon.build:
|
||||
ansible_host: i-027662bd552bf5db0
|
||||
safekeeper-1.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0171efc3604a7b907
|
||||
safekeeper-2.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0de0b03a51676a6ce
|
||||
2
.github/ansible/systemd/pageserver.service
vendored
2
.github/ansible/systemd/pageserver.service
vendored
@@ -1,5 +1,5 @@
|
||||
[Unit]
|
||||
Description=Zenith pageserver
|
||||
Description=Neon pageserver
|
||||
After=network.target auditd.service
|
||||
|
||||
[Service]
|
||||
|
||||
4
.github/ansible/systemd/safekeeper.service
vendored
4
.github/ansible/systemd/safekeeper.service
vendored
@@ -1,12 +1,12 @@
|
||||
[Unit]
|
||||
Description=Zenith safekeeper
|
||||
Description=Neon safekeeper
|
||||
After=network.target auditd.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=safekeeper
|
||||
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
KillSignal=SIGINT
|
||||
|
||||
1
.github/ansible/templates/pageserver.toml.j2
vendored
Normal file
1
.github/ansible/templates/pageserver.toml.j2
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{{ pageserver_config | sivel.toiletwater.to_toml }}
|
||||
42
.github/workflows/benchmarking.yml
vendored
42
.github/workflows/benchmarking.yml
vendored
@@ -46,7 +46,7 @@ jobs:
|
||||
runs-on: [self-hosted, zenith-benchmarker]
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/pg_install
|
||||
POSTGRES_DISTRIB_DIR: /usr/pgsql
|
||||
DEFAULT_PG_VERSION: 14
|
||||
|
||||
steps:
|
||||
@@ -138,22 +138,31 @@ jobs:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
pgbench-compare:
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10gb"
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# neon-captest-new: Run pgbench in a freshly created project
|
||||
# neon-captest-reuse: Same, but reusing existing project
|
||||
# neon-captest-prefetch: Same, with prefetching enabled (new project)
|
||||
platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-aurora ]
|
||||
platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch ]
|
||||
db_size: [ 10gb ]
|
||||
include:
|
||||
- platform: neon-captest-new
|
||||
db_size: 50gb
|
||||
- platform: neon-captest-prefetch
|
||||
db_size: 50gb
|
||||
- platform: rds-aurora
|
||||
db_size: 50gb
|
||||
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
|
||||
runs-on: dev
|
||||
container:
|
||||
@@ -178,7 +187,7 @@ jobs:
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Create Neon Project
|
||||
if: matrix.platform != 'neon-captest-reuse'
|
||||
if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch"]'), matrix.platform)
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
@@ -204,11 +213,9 @@ jobs:
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "::set-output name=connstr::${CONNSTR}"
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
psql ${CONNSTR} -c "SELECT version();"
|
||||
env:
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
|
||||
- name: Set database options
|
||||
if: matrix.platform == 'neon-captest-prefetch'
|
||||
@@ -227,7 +234,6 @@ jobs:
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
|
||||
env:
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
@@ -241,7 +247,6 @@ jobs:
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
|
||||
env:
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
@@ -255,7 +260,6 @@ jobs:
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
|
||||
env:
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
@@ -268,7 +272,7 @@ jobs:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: ${{ matrix.platform != 'neon-captest-reuse' && always() }}
|
||||
if: ${{ steps.create-neon-project.outputs.project_id && always() }}
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
environment: dev
|
||||
|
||||
76
.github/workflows/build_and_test.yml
vendored
76
.github/workflows/build_and_test.yml
vendored
@@ -35,12 +35,12 @@ jobs:
|
||||
echo ref:$GITHUB_REF_NAME
|
||||
echo rev:$(git rev-list --count HEAD)
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
echo "::set-output name=tag::$(git rev-list --count HEAD)"
|
||||
echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
|
||||
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
echo "::set-output name=tag::$GITHUB_RUN_ID"
|
||||
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
shell: bash
|
||||
id: build-tag
|
||||
@@ -78,12 +78,12 @@ jobs:
|
||||
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14)
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Set pg 15 revision for caching
|
||||
id: pg_v15_rev
|
||||
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15)
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
# Set some environment variables used by all the steps.
|
||||
@@ -494,7 +494,7 @@ jobs:
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build neon
|
||||
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID
|
||||
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID
|
||||
|
||||
compute-tools-image:
|
||||
runs-on: dev
|
||||
@@ -508,7 +508,7 @@ jobs:
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build compute tools
|
||||
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID
|
||||
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID
|
||||
|
||||
compute-node-image:
|
||||
runs-on: dev
|
||||
@@ -527,7 +527,7 @@ jobs:
|
||||
# cloud repo depends on this image name, thus duplicating it
|
||||
# remove compute-node when cloud repo is updated
|
||||
- name: Kaniko build compute node with extensions v14 (compatibility)
|
||||
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID
|
||||
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID
|
||||
|
||||
compute-node-image-v14:
|
||||
runs-on: dev
|
||||
@@ -543,7 +543,7 @@ jobs:
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build compute node with extensions v14
|
||||
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID
|
||||
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID
|
||||
|
||||
|
||||
compute-node-image-v15:
|
||||
@@ -560,7 +560,7 @@ jobs:
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build compute node with extensions v15
|
||||
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID
|
||||
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID
|
||||
|
||||
promote-images:
|
||||
runs-on: dev
|
||||
@@ -622,6 +622,8 @@ jobs:
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest
|
||||
|
||||
- name: Configure Docker Hub login
|
||||
run: |
|
||||
@@ -669,12 +671,12 @@ jobs:
|
||||
- id: set-matrix
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}'
|
||||
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}'
|
||||
echo "::set-output name=include::[$STAGING, $NEON_STRESS]"
|
||||
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
|
||||
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}'
|
||||
echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}'
|
||||
echo "::set-output name=include::[$PRODUCTION]"
|
||||
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
|
||||
echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
exit 1
|
||||
@@ -710,7 +712,7 @@ jobs:
|
||||
- name: Setup ansible
|
||||
run: |
|
||||
export PATH="/root/.local/bin:$PATH"
|
||||
pip install --progress-bar off --user ansible boto3
|
||||
pip install --progress-bar off --user ansible boto3 toml
|
||||
|
||||
- name: Redeploy
|
||||
run: |
|
||||
@@ -732,8 +734,48 @@ jobs:
|
||||
chmod 0600 ssh-key
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts
|
||||
deploy-new:
|
||||
runs-on: dev
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
|
||||
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||
if: |
|
||||
(github.ref_name == 'main') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Redeploy
|
||||
run: |
|
||||
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
cd "$(pwd)/.github/ansible"
|
||||
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
./get_binaries.sh
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
RELEASE=true ./get_binaries.sh
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-proxy:
|
||||
|
||||
6
.github/workflows/codestyle.yml
vendored
6
.github/workflows/codestyle.yml
vendored
@@ -36,7 +36,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 2
|
||||
@@ -56,12 +56,12 @@ jobs:
|
||||
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14)
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Set pg 15 revision for caching
|
||||
id: pg_v15_rev
|
||||
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15)
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
|
||||
888
Cargo.lock
generated
888
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
11
Cargo.toml
11
Cargo.toml
@@ -1,3 +1,14 @@
|
||||
# 'named-profiles' feature was stabilized in cargo 1.57. This line makes the
|
||||
# build work with older cargo versions.
|
||||
#
|
||||
# We have this because as of this writing, the latest cargo Debian package
|
||||
# that's available is 1.56. (Confusingly, the Debian package version number
|
||||
# is 0.57, whereas 'cargo --version' says 1.56.)
|
||||
#
|
||||
# See https://tracker.debian.org/pkg/cargo for the current status of the
|
||||
# package. When that gets updated, we can remove this.
|
||||
cargo-features = ["named-profiles"]
|
||||
|
||||
[workspace]
|
||||
members = [
|
||||
"compute_tools",
|
||||
|
||||
10
Dockerfile
10
Dockerfile
@@ -44,7 +44,7 @@ COPY . .
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
||||
RUN set -e \
|
||||
&& mold -run cargo build --bin pageserver --bin safekeeper --bin proxy --locked --release \
|
||||
&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin safekeeper --bin proxy --locked --release \
|
||||
&& cachepot -s
|
||||
|
||||
# Build final image
|
||||
@@ -63,9 +63,10 @@ RUN set -e \
|
||||
&& useradd -d /data neon \
|
||||
&& chown -R neon:neon /data
|
||||
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
||||
@@ -85,4 +86,3 @@ VOLUME ["/data"]
|
||||
USER neon
|
||||
EXPOSE 6400
|
||||
EXPOSE 9898
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@@ -71,10 +71,12 @@ RUN apt update && \
|
||||
RUN apt update && \
|
||||
apt install -y --no-install-recommends -t testing binutils
|
||||
|
||||
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
|
||||
tar xvzf v3.1.4.tar.gz && \
|
||||
cd plv8-3.1.4 && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
@@ -116,8 +118,7 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
|
||||
#
|
||||
FROM build-deps AS neon-pg-ext-build
|
||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
# plv8 still sometimes crashes during the creation
|
||||
# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=h3-pg-build /h3/usr /
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
@@ -76,10 +76,12 @@ RUN apt update && \
|
||||
RUN apt update && \
|
||||
apt install -y --no-install-recommends -t testing binutils
|
||||
|
||||
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
|
||||
tar xvzf v3.1.4.tar.gz && \
|
||||
cd plv8-3.1.4 && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
@@ -121,8 +123,7 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
|
||||
#
|
||||
FROM build-deps AS neon-pg-ext-build
|
||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
# plv8 still sometimes crashes during the creation
|
||||
# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=h3-pg-build /h3/usr /
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
@@ -6,7 +6,7 @@ edition = "2021"
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
chrono = "0.4"
|
||||
clap = "3.0"
|
||||
clap = "4.0"
|
||||
env_logger = "0.9"
|
||||
futures = "0.3.13"
|
||||
hyper = { version = "0.14", features = ["full"] }
|
||||
|
||||
@@ -51,53 +51,19 @@ fn main() -> Result<()> {
|
||||
// TODO: re-use `utils::logging` later
|
||||
init_logger(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
// Env variable is set by `cargo`
|
||||
let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
|
||||
let matches = clap::App::new("compute_ctl")
|
||||
.version(version.unwrap_or("unknown"))
|
||||
.arg(
|
||||
Arg::new("connstr")
|
||||
.short('C')
|
||||
.long("connstr")
|
||||
.value_name("DATABASE_URL")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgdata")
|
||||
.short('D')
|
||||
.long("pgdata")
|
||||
.value_name("DATADIR")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbin")
|
||||
.short('b')
|
||||
.long("pgbin")
|
||||
.value_name("POSTGRES_PATH"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec")
|
||||
.short('s')
|
||||
.long("spec")
|
||||
.value_name("SPEC_JSON"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec-path")
|
||||
.short('S')
|
||||
.long("spec-path")
|
||||
.value_name("SPEC_PATH"),
|
||||
)
|
||||
.get_matches();
|
||||
let matches = cli().get_matches();
|
||||
|
||||
let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
|
||||
let pgdata = matches
|
||||
.get_one::<String>("pgdata")
|
||||
.expect("PGDATA path is required");
|
||||
let connstr = matches
|
||||
.value_of("connstr")
|
||||
.get_one::<String>("connstr")
|
||||
.expect("Postgres connection string is required");
|
||||
let spec = matches.value_of("spec");
|
||||
let spec_path = matches.value_of("spec-path");
|
||||
let spec = matches.get_one::<String>("spec");
|
||||
let spec_path = matches.get_one::<String>("spec-path");
|
||||
|
||||
// Try to use just 'postgres' if no path is provided
|
||||
let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap();
|
||||
|
||||
let spec: ComputeSpec = match spec {
|
||||
// First, try to get cluster spec from the cli argument
|
||||
@@ -173,3 +139,48 @@ fn main() -> Result<()> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn cli() -> clap::Command {
|
||||
// Env variable is set by `cargo`
|
||||
let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
|
||||
clap::Command::new("compute_ctl")
|
||||
.version(version)
|
||||
.arg(
|
||||
Arg::new("connstr")
|
||||
.short('C')
|
||||
.long("connstr")
|
||||
.value_name("DATABASE_URL")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgdata")
|
||||
.short('D')
|
||||
.long("pgdata")
|
||||
.value_name("DATADIR")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbin")
|
||||
.short('b')
|
||||
.long("pgbin")
|
||||
.default_value("postgres")
|
||||
.value_name("POSTGRES_PATH"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec")
|
||||
.short('s')
|
||||
.long("spec")
|
||||
.value_name("SPEC_JSON"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec-path")
|
||||
.short('S')
|
||||
.long("spec-path")
|
||||
.value_name("SPEC_PATH"),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn verify_cli() {
|
||||
cli().debug_assert()
|
||||
}
|
||||
|
||||
@@ -8,11 +8,10 @@ use std::process::Child;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use notify::{RecursiveMode, Watcher};
|
||||
use postgres::{Client, Transaction};
|
||||
use serde::Deserialize;
|
||||
|
||||
use notify::{RecursiveMode, Watcher};
|
||||
|
||||
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
||||
|
||||
/// Rust representation of Postgres role info with only those fields
|
||||
@@ -169,7 +168,7 @@ impl Database {
|
||||
/// it may require a proper quoting too.
|
||||
pub fn to_pg_options(&self) -> String {
|
||||
let mut params: String = self.options.as_pg_options();
|
||||
write!(params, " OWNER {}", &self.owner.quote())
|
||||
write!(params, " OWNER {}", &self.owner.pg_quote())
|
||||
.expect("String is documented to not to error during write operations");
|
||||
|
||||
params
|
||||
@@ -180,18 +179,17 @@ impl Database {
|
||||
/// intended to be used for DB / role names.
|
||||
pub type PgIdent = String;
|
||||
|
||||
/// Generic trait used to provide quoting for strings used in the
|
||||
/// Postgres SQL queries. Currently used only to implement quoting
|
||||
/// of identifiers, but could be used for literals in the future.
|
||||
pub trait PgQuote {
|
||||
fn quote(&self) -> String;
|
||||
/// Generic trait used to provide quoting / encoding for strings used in the
|
||||
/// Postgres SQL queries and DATABASE_URL.
|
||||
pub trait Escaping {
|
||||
fn pg_quote(&self) -> String;
|
||||
}
|
||||
|
||||
impl PgQuote for PgIdent {
|
||||
impl Escaping for PgIdent {
|
||||
/// This is intended to mimic Postgres quote_ident(), but for simplicity it
|
||||
/// always quotes provided string with `""` and escapes every `"`. Not idempotent,
|
||||
/// i.e. if string is already escaped it will be escaped again.
|
||||
fn quote(&self) -> String {
|
||||
/// always quotes provided string with `""` and escapes every `"`.
|
||||
/// **Not idempotent**, i.e. if string is already escaped it will be escaped again.
|
||||
fn pg_quote(&self) -> String {
|
||||
let result = format!("\"{}\"", self.replace('"', "\"\""));
|
||||
result
|
||||
}
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Result;
|
||||
use log::{info, log_enabled, warn, Level};
|
||||
use postgres::config::Config;
|
||||
use postgres::{Client, NoTls};
|
||||
use serde::Deserialize;
|
||||
|
||||
@@ -115,8 +117,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
if existing_roles.iter().any(|r| r.name == op.name) {
|
||||
let query: String = format!(
|
||||
"ALTER ROLE {} RENAME TO {}",
|
||||
op.name.quote(),
|
||||
new_name.quote()
|
||||
op.name.pg_quote(),
|
||||
new_name.pg_quote()
|
||||
);
|
||||
|
||||
warn!("renaming role '{}' to '{}'", op.name, new_name);
|
||||
@@ -162,7 +164,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
}
|
||||
|
||||
if update_role {
|
||||
let mut query: String = format!("ALTER ROLE {} ", name.quote());
|
||||
let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
|
||||
info_print!(" -> update");
|
||||
|
||||
query.push_str(&role.to_pg_options());
|
||||
@@ -170,7 +172,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
}
|
||||
} else {
|
||||
info!("role name: '{}'", &name);
|
||||
let mut query: String = format!("CREATE ROLE {} ", name.quote());
|
||||
let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
|
||||
info!("role create query: '{}'", &query);
|
||||
info_print!(" -> create");
|
||||
|
||||
@@ -179,7 +181,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
|
||||
let grant_query = format!(
|
||||
"GRANT pg_read_all_data, pg_write_all_data TO {}",
|
||||
name.quote()
|
||||
name.pg_quote()
|
||||
);
|
||||
xact.execute(grant_query.as_str(), &[])?;
|
||||
info!("role grant query: '{}'", &grant_query);
|
||||
@@ -215,7 +217,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
|
||||
// We do not check either role exists or not,
|
||||
// Postgres will take care of it for us
|
||||
if op.action == "delete_role" {
|
||||
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
|
||||
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.pg_quote());
|
||||
|
||||
warn!("deleting role '{}'", &op.name);
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
@@ -230,17 +232,16 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
|
||||
fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
|
||||
for db in &node.spec.cluster.databases {
|
||||
if db.owner != *role_name {
|
||||
let mut connstr = node.connstr.clone();
|
||||
// database name is always the last and the only component of the path
|
||||
connstr.set_path(&db.name);
|
||||
let mut conf = Config::from_str(node.connstr.as_str())?;
|
||||
conf.dbname(&db.name);
|
||||
|
||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
let mut client = conf.connect(NoTls)?;
|
||||
|
||||
// This will reassign all dependent objects to the db owner
|
||||
let reassign_query = format!(
|
||||
"REASSIGN OWNED BY {} TO {}",
|
||||
role_name.quote(),
|
||||
db.owner.quote()
|
||||
role_name.pg_quote(),
|
||||
db.owner.pg_quote()
|
||||
);
|
||||
info!(
|
||||
"reassigning objects owned by '{}' in db '{}' to '{}'",
|
||||
@@ -249,7 +250,7 @@ fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()>
|
||||
client.simple_query(&reassign_query)?;
|
||||
|
||||
// This now will only drop privileges of the role
|
||||
let drop_query = format!("DROP OWNED BY {}", role_name.quote());
|
||||
let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
|
||||
client.simple_query(&drop_query)?;
|
||||
}
|
||||
}
|
||||
@@ -279,7 +280,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
// We do not check either DB exists or not,
|
||||
// Postgres will take care of it for us
|
||||
"delete_db" => {
|
||||
let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.quote());
|
||||
let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.pg_quote());
|
||||
|
||||
warn!("deleting database '{}'", &op.name);
|
||||
client.execute(query.as_str(), &[])?;
|
||||
@@ -291,8 +292,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
if existing_dbs.iter().any(|r| r.name == op.name) {
|
||||
let query: String = format!(
|
||||
"ALTER DATABASE {} RENAME TO {}",
|
||||
op.name.quote(),
|
||||
new_name.quote()
|
||||
op.name.pg_quote(),
|
||||
new_name.pg_quote()
|
||||
);
|
||||
|
||||
warn!("renaming database '{}' to '{}'", op.name, new_name);
|
||||
@@ -320,7 +321,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
// XXX: db owner name is returned as quoted string from Postgres,
|
||||
// when quoting is needed.
|
||||
let new_owner = if r.owner.starts_with('"') {
|
||||
db.owner.quote()
|
||||
db.owner.pg_quote()
|
||||
} else {
|
||||
db.owner.clone()
|
||||
};
|
||||
@@ -328,15 +329,15 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
if new_owner != r.owner {
|
||||
let query: String = format!(
|
||||
"ALTER DATABASE {} OWNER TO {}",
|
||||
name.quote(),
|
||||
db.owner.quote()
|
||||
name.pg_quote(),
|
||||
db.owner.pg_quote()
|
||||
);
|
||||
info_print!(" -> update");
|
||||
|
||||
client.execute(query.as_str(), &[])?;
|
||||
}
|
||||
} else {
|
||||
let mut query: String = format!("CREATE DATABASE {} ", name.quote());
|
||||
let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
|
||||
info_print!(" -> create");
|
||||
|
||||
query.push_str(&db.to_pg_options());
|
||||
@@ -366,7 +367,7 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||
.cluster
|
||||
.roles
|
||||
.iter()
|
||||
.map(|r| r.name.quote())
|
||||
.map(|r| r.name.pg_quote())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for db in &spec.cluster.databases {
|
||||
@@ -374,7 +375,7 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||
|
||||
let query: String = format!(
|
||||
"GRANT CREATE ON DATABASE {} TO {}",
|
||||
dbname.quote(),
|
||||
dbname.pg_quote(),
|
||||
roles.join(", ")
|
||||
);
|
||||
info!("grant query {}", &query);
|
||||
@@ -385,12 +386,11 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||
// Do some per-database access adjustments. We'd better do this at db creation time,
|
||||
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
|
||||
// atomically.
|
||||
let mut db_connstr = node.connstr.clone();
|
||||
for db in &node.spec.cluster.databases {
|
||||
// database name is always the last and the only component of the path
|
||||
db_connstr.set_path(&db.name);
|
||||
let mut conf = Config::from_str(node.connstr.as_str())?;
|
||||
conf.dbname(&db.name);
|
||||
|
||||
let mut db_client = Client::connect(db_connstr.as_str(), NoTls)?;
|
||||
let mut db_client = conf.connect(NoTls)?;
|
||||
|
||||
// This will only change ownership on the schema itself, not the objects
|
||||
// inside it. Without it owner of the `public` schema will be `cloud_admin`
|
||||
@@ -419,9 +419,15 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||
END IF;\n\
|
||||
END\n\
|
||||
$$;",
|
||||
db.owner.quote()
|
||||
db.owner.pg_quote()
|
||||
);
|
||||
db_client.simple_query(&alter_query)?;
|
||||
|
||||
// Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
|
||||
// This is needed since postgres 15, where this privilege is removed by default.
|
||||
let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string();
|
||||
info!("grant query for db {} : {}", &db.name, &grant_query);
|
||||
db_client.simple_query(&grant_query)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -33,9 +33,9 @@ mod pg_helpers_tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn quote_ident() {
|
||||
fn ident_pg_quote() {
|
||||
let ident: PgIdent = PgIdent::from("\"name\";\\n select 1;");
|
||||
|
||||
assert_eq!(ident.quote(), "\"\"\"name\"\";\\n select 1;\"");
|
||||
assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,19 +4,19 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
clap = "3.0"
|
||||
comfy-table = "5.0.1"
|
||||
clap = "4.0"
|
||||
comfy-table = "6.1"
|
||||
git-version = "0.3.5"
|
||||
tar = "0.4.38"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "1.12.0"
|
||||
serde_with = "2.0"
|
||||
toml = "0.5"
|
||||
once_cell = "1.13.0"
|
||||
regex = "1"
|
||||
anyhow = "1.0"
|
||||
thiserror = "1"
|
||||
nix = "0.23"
|
||||
nix = "0.25"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
|
||||
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
//! rely on `neon_local` to set up the environment for each test.
|
||||
//!
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use clap::{App, AppSettings, Arg, ArgMatches};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
|
||||
use control_plane::compute::ComputeControlPlane;
|
||||
use control_plane::local_env::{EtcdBroker, LocalEnv};
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
@@ -85,212 +85,7 @@ struct TimelineTreeEl {
|
||||
// * Providing CLI api to the pageserver
|
||||
// * TODO: export/import to/from usual postgres
|
||||
fn main() -> Result<()> {
|
||||
let branch_name_arg = Arg::new("branch-name")
|
||||
.long("branch-name")
|
||||
.takes_value(true)
|
||||
.help("Name of the branch to be created or used as an alias for other services")
|
||||
.required(false);
|
||||
|
||||
let pg_node_arg = Arg::new("node").help("Postgres node name").required(false);
|
||||
|
||||
let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
|
||||
|
||||
let tenant_id_arg = Arg::new("tenant-id")
|
||||
.long("tenant-id")
|
||||
.help("Tenant id. Represented as a hexadecimal string 32 symbols length")
|
||||
.takes_value(true)
|
||||
.required(false);
|
||||
|
||||
let timeline_id_arg = Arg::new("timeline-id")
|
||||
.long("timeline-id")
|
||||
.help("Timeline id. Represented as a hexadecimal string 32 symbols length")
|
||||
.takes_value(true)
|
||||
.required(false);
|
||||
|
||||
let pg_version_arg = Arg::new("pg-version")
|
||||
.long("pg-version")
|
||||
.help("Postgres version to use for the initial tenant")
|
||||
.required(false)
|
||||
.takes_value(true)
|
||||
.default_value(DEFAULT_PG_VERSION);
|
||||
|
||||
let port_arg = Arg::new("port")
|
||||
.long("port")
|
||||
.required(false)
|
||||
.value_name("port");
|
||||
|
||||
let stop_mode_arg = Arg::new("stop-mode")
|
||||
.short('m')
|
||||
.takes_value(true)
|
||||
.possible_values(&["fast", "immediate"])
|
||||
.help("If 'immediate', don't flush repository data at shutdown")
|
||||
.required(false)
|
||||
.value_name("stop-mode");
|
||||
|
||||
let pageserver_config_args = Arg::new("pageserver-config-override")
|
||||
.long("pageserver-config-override")
|
||||
.takes_value(true)
|
||||
.number_of_values(1)
|
||||
.multiple_occurrences(true)
|
||||
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
|
||||
.required(false);
|
||||
|
||||
let lsn_arg = Arg::new("lsn")
|
||||
.long("lsn")
|
||||
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
|
||||
.takes_value(true)
|
||||
.required(false);
|
||||
|
||||
let matches = App::new("Neon CLI")
|
||||
.setting(AppSettings::ArgRequiredElseHelp)
|
||||
.version(GIT_VERSION)
|
||||
.subcommand(
|
||||
App::new("init")
|
||||
.about("Initialize a new Neon repository")
|
||||
.arg(pageserver_config_args.clone())
|
||||
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
|
||||
.arg(
|
||||
Arg::new("config")
|
||||
.long("config")
|
||||
.required(false)
|
||||
.value_name("config"),
|
||||
)
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
.subcommand(
|
||||
App::new("timeline")
|
||||
.about("Manage timelines")
|
||||
.subcommand(App::new("list")
|
||||
.about("List all timelines, available to this pageserver")
|
||||
.arg(tenant_id_arg.clone()))
|
||||
.subcommand(App::new("branch")
|
||||
.about("Create a new timeline, using another timeline as a base, copying its data")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name").takes_value(true)
|
||||
.help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
|
||||
.arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn").takes_value(true)
|
||||
.help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false)))
|
||||
.subcommand(App::new("create")
|
||||
.about("Create a new blank timeline")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
.subcommand(App::new("import")
|
||||
.about("Import timeline from basebackup directory")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(timeline_id_arg.clone())
|
||||
.arg(Arg::new("node-name").long("node-name").takes_value(true)
|
||||
.help("Name to assign to the imported timeline"))
|
||||
.arg(Arg::new("base-tarfile").long("base-tarfile").takes_value(true)
|
||||
.help("Basebackup tarfile to import"))
|
||||
.arg(Arg::new("base-lsn").long("base-lsn").takes_value(true)
|
||||
.help("Lsn the basebackup starts at"))
|
||||
.arg(Arg::new("wal-tarfile").long("wal-tarfile").takes_value(true)
|
||||
.help("Wal to add after base"))
|
||||
.arg(Arg::new("end-lsn").long("end-lsn").takes_value(true)
|
||||
.help("Lsn the basebackup ends at"))
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
).subcommand(
|
||||
App::new("tenant")
|
||||
.setting(AppSettings::ArgRequiredElseHelp)
|
||||
.about("Manage tenants")
|
||||
.subcommand(App::new("list"))
|
||||
.subcommand(App::new("create")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
|
||||
.arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false))
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
.subcommand(App::new("config")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false))
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
App::new("pageserver")
|
||||
.setting(AppSettings::ArgRequiredElseHelp)
|
||||
.about("Manage pageserver")
|
||||
.subcommand(App::new("status"))
|
||||
.subcommand(App::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
|
||||
.subcommand(App::new("stop").about("Stop local pageserver")
|
||||
.arg(stop_mode_arg.clone()))
|
||||
.subcommand(App::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
|
||||
)
|
||||
.subcommand(
|
||||
App::new("safekeeper")
|
||||
.setting(AppSettings::ArgRequiredElseHelp)
|
||||
.about("Manage safekeepers")
|
||||
.subcommand(App::new("start")
|
||||
.about("Start local safekeeper")
|
||||
.arg(safekeeper_id_arg.clone())
|
||||
)
|
||||
.subcommand(App::new("stop")
|
||||
.about("Stop local safekeeper")
|
||||
.arg(safekeeper_id_arg.clone())
|
||||
.arg(stop_mode_arg.clone())
|
||||
)
|
||||
.subcommand(App::new("restart")
|
||||
.about("Restart local safekeeper")
|
||||
.arg(safekeeper_id_arg.clone())
|
||||
.arg(stop_mode_arg.clone())
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
App::new("pg")
|
||||
.setting(AppSettings::ArgRequiredElseHelp)
|
||||
.about("Manage postgres instances")
|
||||
.subcommand(App::new("list").arg(tenant_id_arg.clone()))
|
||||
.subcommand(App::new("create")
|
||||
.about("Create a postgres compute node")
|
||||
.arg(pg_node_arg.clone())
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(lsn_arg.clone())
|
||||
.arg(port_arg.clone())
|
||||
.arg(
|
||||
Arg::new("config-only")
|
||||
.help("Don't do basebackup, create compute node with only config files")
|
||||
.long("config-only")
|
||||
.required(false))
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
.subcommand(App::new("start")
|
||||
.about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
|
||||
.arg(pg_node_arg.clone())
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(timeline_id_arg.clone())
|
||||
.arg(lsn_arg.clone())
|
||||
.arg(port_arg.clone())
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
.subcommand(
|
||||
App::new("stop")
|
||||
.arg(pg_node_arg.clone())
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(
|
||||
Arg::new("destroy")
|
||||
.help("Also delete data directory (now optional, should be default in future)")
|
||||
.long("destroy")
|
||||
.required(false)
|
||||
)
|
||||
)
|
||||
|
||||
)
|
||||
.subcommand(
|
||||
App::new("start")
|
||||
.about("Start page server and safekeepers")
|
||||
.arg(pageserver_config_args)
|
||||
)
|
||||
.subcommand(
|
||||
App::new("stop")
|
||||
.about("Stop page server and safekeepers")
|
||||
.arg(stop_mode_arg.clone())
|
||||
)
|
||||
.get_matches();
|
||||
let matches = cli().get_matches();
|
||||
|
||||
let (sub_name, sub_args) = match matches.subcommand() {
|
||||
Some(subcommand_data) => subcommand_data,
|
||||
@@ -358,9 +153,7 @@ fn print_timelines_tree(
|
||||
|
||||
// Memorize all direct children of each timeline.
|
||||
for timeline in timelines.iter() {
|
||||
if let Some(ancestor_timeline_id) =
|
||||
timeline.local.as_ref().and_then(|l| l.ancestor_timeline_id)
|
||||
{
|
||||
if let Some(ancestor_timeline_id) = timeline.ancestor_timeline_id {
|
||||
timelines_hash
|
||||
.get_mut(&ancestor_timeline_id)
|
||||
.context("missing timeline info in the HashMap")?
|
||||
@@ -371,13 +164,7 @@ fn print_timelines_tree(
|
||||
|
||||
for timeline in timelines_hash.values() {
|
||||
// Start with root local timelines (no ancestors) first.
|
||||
if timeline
|
||||
.info
|
||||
.local
|
||||
.as_ref()
|
||||
.and_then(|l| l.ancestor_timeline_id)
|
||||
.is_none()
|
||||
{
|
||||
if timeline.info.ancestor_timeline_id.is_none() {
|
||||
print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?;
|
||||
}
|
||||
}
|
||||
@@ -394,17 +181,8 @@ fn print_timeline(
|
||||
timeline: &TimelineTreeEl,
|
||||
timelines: &HashMap<TimelineId, TimelineTreeEl>,
|
||||
) -> Result<()> {
|
||||
let local_remote = match (timeline.info.local.as_ref(), timeline.info.remote.as_ref()) {
|
||||
(None, None) => unreachable!("in this case no info for a timeline is found"),
|
||||
(None, Some(_)) => "(R)",
|
||||
(Some(_), None) => "(L)",
|
||||
(Some(_), Some(_)) => "(L+R)",
|
||||
};
|
||||
// Draw main padding
|
||||
print!("{} ", local_remote);
|
||||
|
||||
if nesting_level > 0 {
|
||||
let ancestor_lsn = match timeline.info.local.as_ref().and_then(|i| i.ancestor_lsn) {
|
||||
let ancestor_lsn = match timeline.info.ancestor_lsn {
|
||||
Some(lsn) => lsn.to_string(),
|
||||
None => "Unknown Lsn".to_string(),
|
||||
};
|
||||
@@ -492,16 +270,16 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
|
||||
|
||||
fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
|
||||
sub_match
|
||||
.value_of("tenant-id")
|
||||
.map(TenantId::from_str)
|
||||
.get_one::<String>("tenant-id")
|
||||
.map(|tenant_id| TenantId::from_str(tenant_id))
|
||||
.transpose()
|
||||
.context("Failed to parse tenant id from the argument string")
|
||||
}
|
||||
|
||||
fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId>> {
|
||||
sub_match
|
||||
.value_of("timeline-id")
|
||||
.map(TimelineId::from_str)
|
||||
.get_one::<String>("timeline-id")
|
||||
.map(|timeline_id| TimelineId::from_str(timeline_id))
|
||||
.transpose()
|
||||
.context("Failed to parse timeline id from the argument string")
|
||||
}
|
||||
@@ -510,19 +288,22 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
let initial_timeline_id_arg = parse_timeline_id(init_match)?;
|
||||
|
||||
// Create config file
|
||||
let toml_file: String = if let Some(config_path) = init_match.value_of("config") {
|
||||
let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
|
||||
// load and parse the file
|
||||
std::fs::read_to_string(std::path::Path::new(config_path))
|
||||
.with_context(|| format!("Could not read configuration file '{config_path}'"))?
|
||||
std::fs::read_to_string(config_path).with_context(|| {
|
||||
format!(
|
||||
"Could not read configuration file '{}'",
|
||||
config_path.display()
|
||||
)
|
||||
})?
|
||||
} else {
|
||||
// Built-in default config
|
||||
default_conf(&EtcdBroker::locate_etcd()?)
|
||||
};
|
||||
|
||||
let pg_version = init_match
|
||||
.value_of("pg-version")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.get_one::<u32>("pg-version")
|
||||
.copied()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let mut env =
|
||||
@@ -558,9 +339,10 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
|
||||
fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
|
||||
init_match
|
||||
.values_of("pageserver-config-override")
|
||||
.get_many::<String>("pageserver-config-override")
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.map(|s| s.as_str())
|
||||
.collect()
|
||||
}
|
||||
|
||||
@@ -575,7 +357,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
Some(("create", create_match)) => {
|
||||
let initial_tenant_id = parse_tenant_id(create_match)?;
|
||||
let tenant_conf: HashMap<_, _> = create_match
|
||||
.values_of("config")
|
||||
.get_many::<String>("config")
|
||||
.map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
|
||||
.unwrap_or_default();
|
||||
let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?;
|
||||
@@ -584,9 +366,8 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
// Create an initial timeline for the new tenant
|
||||
let new_timeline_id = parse_timeline_id(create_match)?;
|
||||
let pg_version = create_match
|
||||
.value_of("pg-version")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.get_one::<u32>("pg-version")
|
||||
.copied()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let timeline_info = pageserver.timeline_create(
|
||||
@@ -597,10 +378,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
Some(pg_version),
|
||||
)?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
let last_record_lsn = timeline_info
|
||||
.local
|
||||
.context(format!("Failed to get last record LSN: no local timeline info for timeline {new_timeline_id}"))?
|
||||
.last_record_lsn;
|
||||
let last_record_lsn = timeline_info.last_record_lsn;
|
||||
|
||||
env.register_branch_mapping(
|
||||
DEFAULT_BRANCH_NAME.to_string(),
|
||||
@@ -615,7 +393,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
Some(("config", create_match)) => {
|
||||
let tenant_id = get_tenant_id(create_match, env)?;
|
||||
let tenant_conf: HashMap<_, _> = create_match
|
||||
.values_of("config")
|
||||
.get_many::<String>("config")
|
||||
.map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
|
||||
.unwrap_or_default();
|
||||
|
||||
@@ -642,23 +420,19 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
||||
Some(("create", create_match)) => {
|
||||
let tenant_id = get_tenant_id(create_match, env)?;
|
||||
let new_branch_name = create_match
|
||||
.value_of("branch-name")
|
||||
.get_one::<String>("branch-name")
|
||||
.ok_or_else(|| anyhow!("No branch name provided"))?;
|
||||
|
||||
let pg_version = create_match
|
||||
.value_of("pg-version")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.get_one::<u32>("pg-version")
|
||||
.copied()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let timeline_info =
|
||||
pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
|
||||
let last_record_lsn = timeline_info
|
||||
.local
|
||||
.expect("no local timeline info")
|
||||
.last_record_lsn;
|
||||
let last_record_lsn = timeline_info.last_record_lsn;
|
||||
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
|
||||
|
||||
println!(
|
||||
@@ -670,35 +444,32 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
||||
let tenant_id = get_tenant_id(import_match, env)?;
|
||||
let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided");
|
||||
let name = import_match
|
||||
.value_of("node-name")
|
||||
.get_one::<String>("node-name")
|
||||
.ok_or_else(|| anyhow!("No node name provided"))?;
|
||||
|
||||
// Parse base inputs
|
||||
let base_tarfile = import_match
|
||||
.value_of("base-tarfile")
|
||||
.map(|s| PathBuf::from_str(s).unwrap())
|
||||
.ok_or_else(|| anyhow!("No base-tarfile provided"))?;
|
||||
.get_one::<PathBuf>("base-tarfile")
|
||||
.ok_or_else(|| anyhow!("No base-tarfile provided"))?
|
||||
.to_owned();
|
||||
let base_lsn = Lsn::from_str(
|
||||
import_match
|
||||
.value_of("base-lsn")
|
||||
.get_one::<String>("base-lsn")
|
||||
.ok_or_else(|| anyhow!("No base-lsn provided"))?,
|
||||
)?;
|
||||
let base = (base_lsn, base_tarfile);
|
||||
|
||||
// Parse pg_wal inputs
|
||||
let wal_tarfile = import_match
|
||||
.value_of("wal-tarfile")
|
||||
.map(|s| PathBuf::from_str(s).unwrap());
|
||||
let wal_tarfile = import_match.get_one::<PathBuf>("wal-tarfile").cloned();
|
||||
let end_lsn = import_match
|
||||
.value_of("end-lsn")
|
||||
.get_one::<String>("end-lsn")
|
||||
.map(|s| Lsn::from_str(s).unwrap());
|
||||
// TODO validate both or none are provided
|
||||
let pg_wal = end_lsn.zip(wal_tarfile);
|
||||
|
||||
let pg_version = import_match
|
||||
.value_of("pg-version")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.get_one::<u32>("pg-version")
|
||||
.copied()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
||||
@@ -713,10 +484,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
||||
Some(("branch", branch_match)) => {
|
||||
let tenant_id = get_tenant_id(branch_match, env)?;
|
||||
let new_branch_name = branch_match
|
||||
.value_of("branch-name")
|
||||
.get_one::<String>("branch-name")
|
||||
.ok_or_else(|| anyhow!("No branch name provided"))?;
|
||||
let ancestor_branch_name = branch_match
|
||||
.value_of("ancestor-branch-name")
|
||||
.get_one::<String>("ancestor-branch-name")
|
||||
.map(|s| s.as_str())
|
||||
.unwrap_or(DEFAULT_BRANCH_NAME);
|
||||
let ancestor_timeline_id = env
|
||||
.get_branch_timeline_id(ancestor_branch_name, tenant_id)
|
||||
@@ -725,8 +497,8 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
||||
})?;
|
||||
|
||||
let start_lsn = branch_match
|
||||
.value_of("ancestor-start-lsn")
|
||||
.map(Lsn::from_str)
|
||||
.get_one::<String>("ancestor-start-lsn")
|
||||
.map(|lsn_str| Lsn::from_str(lsn_str))
|
||||
.transpose()
|
||||
.context("Failed to parse ancestor start Lsn from the request")?;
|
||||
let timeline_info = pageserver.timeline_create(
|
||||
@@ -738,10 +510,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
||||
)?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
|
||||
let last_record_lsn = timeline_info
|
||||
.local
|
||||
.expect("no local timeline info")
|
||||
.last_record_lsn;
|
||||
let last_record_lsn = timeline_info.last_record_lsn;
|
||||
|
||||
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
|
||||
|
||||
@@ -801,7 +570,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
// Use the LSN at the end of the timeline.
|
||||
timeline_infos
|
||||
.get(&node.timeline_id)
|
||||
.and_then(|bi| bi.local.as_ref().map(|l| l.last_record_lsn.to_string()))
|
||||
.map(|bi| bi.last_record_lsn.to_string())
|
||||
.unwrap_or_else(|| "?".to_string())
|
||||
}
|
||||
Some(lsn) => {
|
||||
@@ -830,45 +599,39 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
}
|
||||
"create" => {
|
||||
let branch_name = sub_args
|
||||
.value_of("branch-name")
|
||||
.get_one::<String>("branch-name")
|
||||
.map(|s| s.as_str())
|
||||
.unwrap_or(DEFAULT_BRANCH_NAME);
|
||||
let node_name = sub_args
|
||||
.value_of("node")
|
||||
.map(ToString::to_string)
|
||||
.unwrap_or_else(|| format!("{}_node", branch_name));
|
||||
.get_one::<String>("node")
|
||||
.map(|node_name| node_name.to_string())
|
||||
.unwrap_or_else(|| format!("{branch_name}_node"));
|
||||
|
||||
let lsn = sub_args
|
||||
.value_of("lsn")
|
||||
.map(Lsn::from_str)
|
||||
.get_one::<String>("lsn")
|
||||
.map(|lsn_str| Lsn::from_str(lsn_str))
|
||||
.transpose()
|
||||
.context("Failed to parse Lsn from the request")?;
|
||||
let timeline_id = env
|
||||
.get_branch_timeline_id(branch_name, tenant_id)
|
||||
.ok_or_else(|| anyhow!("Found no timeline id for branch name '{}'", branch_name))?;
|
||||
.ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?;
|
||||
|
||||
let port: Option<u16> = match sub_args.value_of("port") {
|
||||
Some(p) => Some(p.parse()?),
|
||||
None => None,
|
||||
};
|
||||
let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
|
||||
|
||||
let pg_version = sub_args
|
||||
.value_of("pg-version")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.get_one::<u32>("pg-version")
|
||||
.copied()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?;
|
||||
}
|
||||
"start" => {
|
||||
let port: Option<u16> = match sub_args.value_of("port") {
|
||||
Some(p) => Some(p.parse()?),
|
||||
None => None,
|
||||
};
|
||||
let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
|
||||
let node_name = sub_args
|
||||
.value_of("node")
|
||||
.get_one::<String>("node")
|
||||
.ok_or_else(|| anyhow!("No node name was provided to start"))?;
|
||||
|
||||
let node = cplane.nodes.get(&(tenant_id, node_name.to_owned()));
|
||||
let node = cplane.nodes.get(&(tenant_id, node_name.to_string()));
|
||||
|
||||
let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) {
|
||||
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
|
||||
@@ -879,36 +642,33 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
};
|
||||
|
||||
if let Some(node) = node {
|
||||
println!("Starting existing postgres {}...", node_name);
|
||||
println!("Starting existing postgres {node_name}...");
|
||||
node.start(&auth_token)?;
|
||||
} else {
|
||||
let branch_name = sub_args
|
||||
.value_of("branch-name")
|
||||
.get_one::<String>("branch-name")
|
||||
.map(|s| s.as_str())
|
||||
.unwrap_or(DEFAULT_BRANCH_NAME);
|
||||
let timeline_id = env
|
||||
.get_branch_timeline_id(branch_name, tenant_id)
|
||||
.ok_or_else(|| {
|
||||
anyhow!("Found no timeline id for branch name '{}'", branch_name)
|
||||
anyhow!("Found no timeline id for branch name '{branch_name}'")
|
||||
})?;
|
||||
let lsn = sub_args
|
||||
.value_of("lsn")
|
||||
.map(Lsn::from_str)
|
||||
.get_one::<String>("lsn")
|
||||
.map(|lsn_str| Lsn::from_str(lsn_str))
|
||||
.transpose()
|
||||
.context("Failed to parse Lsn from the request")?;
|
||||
let pg_version = sub_args
|
||||
.value_of("pg-version")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
.get_one::<u32>("pg-version")
|
||||
.copied()
|
||||
.context("Failed to `pg-version` from the argument string")?;
|
||||
// when used with custom port this results in non obvious behaviour
|
||||
// port is remembered from first start command, i e
|
||||
// start --port X
|
||||
// stop
|
||||
// start <-- will also use port X even without explicit port argument
|
||||
println!(
|
||||
"Starting new postgres (v{}) {} on timeline {} ...",
|
||||
pg_version, node_name, timeline_id
|
||||
);
|
||||
println!("Starting new postgres (v{pg_version}) {node_name} on timeline {timeline_id} ...");
|
||||
|
||||
let node =
|
||||
cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?;
|
||||
@@ -917,18 +677,18 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
}
|
||||
"stop" => {
|
||||
let node_name = sub_args
|
||||
.value_of("node")
|
||||
.get_one::<String>("node")
|
||||
.ok_or_else(|| anyhow!("No node name was provided to stop"))?;
|
||||
let destroy = sub_args.is_present("destroy");
|
||||
let destroy = sub_args.get_flag("destroy");
|
||||
|
||||
let node = cplane
|
||||
.nodes
|
||||
.get(&(tenant_id, node_name.to_owned()))
|
||||
.with_context(|| format!("postgres {} is not found", node_name))?;
|
||||
.get(&(tenant_id, node_name.to_string()))
|
||||
.with_context(|| format!("postgres {node_name} is not found"))?;
|
||||
node.stop(destroy)?;
|
||||
}
|
||||
|
||||
_ => bail!("Unexpected pg subcommand '{}'", sub_name),
|
||||
_ => bail!("Unexpected pg subcommand '{sub_name}'"),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -946,7 +706,10 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
||||
}
|
||||
|
||||
Some(("stop", stop_match)) => {
|
||||
let immediate = stop_match.value_of("stop-mode") == Some("immediate");
|
||||
let immediate = stop_match
|
||||
.get_one::<String>("stop-mode")
|
||||
.map(|s| s.as_str())
|
||||
== Some("immediate");
|
||||
|
||||
if let Err(e) = pageserver.stop(immediate) {
|
||||
eprintln!("pageserver stop failed: {}", e);
|
||||
@@ -996,7 +759,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
||||
};
|
||||
|
||||
// All the commands take an optional safekeeper name argument
|
||||
let sk_id = if let Some(id_str) = sub_args.value_of("id") {
|
||||
let sk_id = if let Some(id_str) = sub_args.get_one::<String>("id") {
|
||||
NodeId(id_str.parse().context("while parsing safekeeper id")?)
|
||||
} else {
|
||||
DEFAULT_SAFEKEEPER_ID
|
||||
@@ -1012,7 +775,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
||||
}
|
||||
|
||||
"stop" => {
|
||||
let immediate = sub_args.value_of("stop-mode") == Some("immediate");
|
||||
let immediate =
|
||||
sub_args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
|
||||
|
||||
if let Err(e) = safekeeper.stop(immediate) {
|
||||
eprintln!("safekeeper stop failed: {}", e);
|
||||
@@ -1021,7 +785,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
||||
}
|
||||
|
||||
"restart" => {
|
||||
let immediate = sub_args.value_of("stop-mode") == Some("immediate");
|
||||
let immediate =
|
||||
sub_args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
|
||||
|
||||
if let Err(e) = safekeeper.stop(immediate) {
|
||||
eprintln!("safekeeper stop failed: {}", e);
|
||||
@@ -1065,7 +830,8 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
|
||||
}
|
||||
|
||||
fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let immediate = sub_match.value_of("stop-mode") == Some("immediate");
|
||||
let immediate =
|
||||
sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
|
||||
|
||||
let pageserver = PageServerNode::from_env(env);
|
||||
|
||||
@@ -1098,3 +864,219 @@ fn try_stop_etcd_process(env: &local_env::LocalEnv) {
|
||||
eprintln!("etcd stop failed: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
fn cli() -> Command {
|
||||
let branch_name_arg = Arg::new("branch-name")
|
||||
.long("branch-name")
|
||||
.help("Name of the branch to be created or used as an alias for other services")
|
||||
.required(false);
|
||||
|
||||
let pg_node_arg = Arg::new("node").help("Postgres node name").required(false);
|
||||
|
||||
let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
|
||||
|
||||
let tenant_id_arg = Arg::new("tenant-id")
|
||||
.long("tenant-id")
|
||||
.help("Tenant id. Represented as a hexadecimal string 32 symbols length")
|
||||
.required(false);
|
||||
|
||||
let timeline_id_arg = Arg::new("timeline-id")
|
||||
.long("timeline-id")
|
||||
.help("Timeline id. Represented as a hexadecimal string 32 symbols length")
|
||||
.required(false);
|
||||
|
||||
let pg_version_arg = Arg::new("pg-version")
|
||||
.long("pg-version")
|
||||
.help("Postgres version to use for the initial tenant")
|
||||
.required(false)
|
||||
.value_parser(value_parser!(u32))
|
||||
.default_value(DEFAULT_PG_VERSION);
|
||||
|
||||
let port_arg = Arg::new("port")
|
||||
.long("port")
|
||||
.required(false)
|
||||
.value_parser(value_parser!(u16))
|
||||
.value_name("port");
|
||||
|
||||
let stop_mode_arg = Arg::new("stop-mode")
|
||||
.short('m')
|
||||
.value_parser(["fast", "immediate"])
|
||||
.help("If 'immediate', don't flush repository data at shutdown")
|
||||
.required(false)
|
||||
.value_name("stop-mode");
|
||||
|
||||
let pageserver_config_args = Arg::new("pageserver-config-override")
|
||||
.long("pageserver-config-override")
|
||||
.num_args(1)
|
||||
.action(ArgAction::Append)
|
||||
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
|
||||
.required(false);
|
||||
|
||||
let lsn_arg = Arg::new("lsn")
|
||||
.long("lsn")
|
||||
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
|
||||
.required(false);
|
||||
|
||||
Command::new("Neon CLI")
|
||||
.arg_required_else_help(true)
|
||||
.version(GIT_VERSION)
|
||||
.subcommand(
|
||||
Command::new("init")
|
||||
.about("Initialize a new Neon repository")
|
||||
.arg(pageserver_config_args.clone())
|
||||
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
|
||||
.arg(
|
||||
Arg::new("config")
|
||||
.long("config")
|
||||
.required(false)
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
.value_name("config"),
|
||||
)
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("timeline")
|
||||
.about("Manage timelines")
|
||||
.subcommand(Command::new("list")
|
||||
.about("List all timelines, available to this pageserver")
|
||||
.arg(tenant_id_arg.clone()))
|
||||
.subcommand(Command::new("branch")
|
||||
.about("Create a new timeline, using another timeline as a base, copying its data")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name")
|
||||
.help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
|
||||
.arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn")
|
||||
.help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false)))
|
||||
.subcommand(Command::new("create")
|
||||
.about("Create a new blank timeline")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("import")
|
||||
.about("Import timeline from basebackup directory")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(timeline_id_arg.clone())
|
||||
.arg(Arg::new("node-name").long("node-name")
|
||||
.help("Name to assign to the imported timeline"))
|
||||
.arg(Arg::new("base-tarfile")
|
||||
.long("base-tarfile")
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
.help("Basebackup tarfile to import")
|
||||
)
|
||||
.arg(Arg::new("base-lsn").long("base-lsn")
|
||||
.help("Lsn the basebackup starts at"))
|
||||
.arg(Arg::new("wal-tarfile")
|
||||
.long("wal-tarfile")
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
.help("Wal to add after base")
|
||||
)
|
||||
.arg(Arg::new("end-lsn").long("end-lsn")
|
||||
.help("Lsn the basebackup ends at"))
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
).subcommand(
|
||||
Command::new("tenant")
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage tenants")
|
||||
.subcommand(Command::new("list"))
|
||||
.subcommand(Command::new("create")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
|
||||
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("config")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("pageserver")
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage pageserver")
|
||||
.subcommand(Command::new("status"))
|
||||
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
|
||||
.subcommand(Command::new("stop").about("Stop local pageserver")
|
||||
.arg(stop_mode_arg.clone()))
|
||||
.subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("safekeeper")
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage safekeepers")
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start local safekeeper")
|
||||
.arg(safekeeper_id_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop local safekeeper")
|
||||
.arg(safekeeper_id_arg.clone())
|
||||
.arg(stop_mode_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("restart")
|
||||
.about("Restart local safekeeper")
|
||||
.arg(safekeeper_id_arg)
|
||||
.arg(stop_mode_arg.clone())
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("pg")
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage postgres instances")
|
||||
.subcommand(Command::new("list").arg(tenant_id_arg.clone()))
|
||||
.subcommand(Command::new("create")
|
||||
.about("Create a postgres compute node")
|
||||
.arg(pg_node_arg.clone())
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(lsn_arg.clone())
|
||||
.arg(port_arg.clone())
|
||||
.arg(
|
||||
Arg::new("config-only")
|
||||
.help("Don't do basebackup, create compute node with only config files")
|
||||
.long("config-only")
|
||||
.required(false))
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
|
||||
.arg(pg_node_arg.clone())
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(branch_name_arg)
|
||||
.arg(timeline_id_arg)
|
||||
.arg(lsn_arg)
|
||||
.arg(port_arg)
|
||||
.arg(pg_version_arg)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("stop")
|
||||
.arg(pg_node_arg)
|
||||
.arg(tenant_id_arg)
|
||||
.arg(
|
||||
Arg::new("destroy")
|
||||
.help("Also delete data directory (now optional, should be default in future)")
|
||||
.long("destroy")
|
||||
.action(ArgAction::SetTrue)
|
||||
.required(false)
|
||||
)
|
||||
)
|
||||
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("start")
|
||||
.about("Start page server and safekeepers")
|
||||
.arg(pageserver_config_args)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("stop")
|
||||
.about("Stop page server and safekeepers")
|
||||
.arg(stop_mode_arg)
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn verify_cli() {
|
||||
cli().debug_assert();
|
||||
}
|
||||
|
||||
@@ -12,13 +12,8 @@ use nix::unistd::Pid;
|
||||
use postgres::Config;
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use safekeeper_api::models::TimelineCreateRequest;
|
||||
use thiserror::Error;
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
http::error::HttpErrorBody,
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
};
|
||||
use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId};
|
||||
|
||||
use crate::local_env::{LocalEnv, SafekeeperConf};
|
||||
use crate::storage::PageServerNode;
|
||||
@@ -281,24 +276,4 @@ impl SafekeeperNode {
|
||||
.error_from_body()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn timeline_create(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
peer_ids: Vec<NodeId>,
|
||||
) -> Result<()> {
|
||||
Ok(self
|
||||
.http_request(
|
||||
Method::POST,
|
||||
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
|
||||
)
|
||||
.json(&TimelineCreateRequest {
|
||||
timeline_id,
|
||||
peer_ids,
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json()?)
|
||||
}
|
||||
}
|
||||
|
||||
163
docs/rfcs/018-storage-messaging-2.md
Normal file
163
docs/rfcs/018-storage-messaging-2.md
Normal file
@@ -0,0 +1,163 @@
|
||||
# Storage messaging
|
||||
|
||||
Safekeepers need to communicate to each other to
|
||||
* Trim WAL on safekeepers;
|
||||
* Decide on which SK should push WAL to the S3;
|
||||
* Decide on when to shut down SK<->pageserver connection;
|
||||
* Understand state of each other to perform peer recovery;
|
||||
|
||||
Pageservers need to communicate to safekeepers to decide which SK should provide
|
||||
WAL to the pageserver.
|
||||
|
||||
This is an iteration on [015-storage-messaging](https://github.com/neondatabase/neon/blob/main/docs/rfcs/015-storage-messaging.md) describing the current situation,
|
||||
potential performance issue and ways to address it.
|
||||
|
||||
## Background
|
||||
|
||||
What we have currently is very close to etcd variant described in
|
||||
015-storage-messaging. Basically, we have single `SkTimelineInfo` message
|
||||
periodically sent by all safekeepers to etcd for each timeline.
|
||||
* Safekeepers subscribe to it to learn status of peers (currently they subscribe to
|
||||
'everything', but they can and should fetch data only for timelines they hold).
|
||||
* Pageserver subscribes to it (separate watch per timeline) to learn safekeepers
|
||||
positions; based on that, it decides from which safekeepers to pull WAL.
|
||||
|
||||
Also, safekeepers use etcd elections API to make sure only single safekeeper
|
||||
offloads WAL.
|
||||
|
||||
It works, and callmemaybe is gone. However, this has a performance
|
||||
hazard. Currently deployed etcd can do about 6k puts per second (using its own
|
||||
`benchmark` tool); on my 6 core laptop, while running on tmpfs, this gets to
|
||||
35k. Making benchmark closer to our usage [etcd watch bench](https://github.com/arssher/etcd-client/blob/watch-bench/examples/watch_bench.rs),
|
||||
I get ~10k received messages per second with various number of publisher-subscribers
|
||||
(laptop, tmpfs). Dividing this by 12 (3 sks generate msg, 1 ps + 3 sk consume them) we
|
||||
get about 800 active timelines, if message is sent each second. Not extremely
|
||||
low, but quite reachable.
|
||||
|
||||
A lot of idle watches seem to be ok though -- which is good, as pageserver
|
||||
subscribes to all its timelines regardless of their activity.
|
||||
|
||||
Also, running etcd with fsyncs disabled is messy -- data dir must be wiped on
|
||||
each restart or there is a risk of corruption errors.
|
||||
|
||||
The reason is that etcd does much more than we need; it is a fault-tolerant
|
||||
store with strong consistency, but I claim all we need here is just simplest pub
|
||||
sub with best effort delivery, because
|
||||
* We already have centralized source of truth for long running data, like which
|
||||
tlis are on which nodes -- the console.
|
||||
* Momentary data (safekeeper/pageserver progress) doesn't make sense to persist.
|
||||
Instead of putting each change to broker, expecting it to reliably deliver it
|
||||
is better to just have constant flow of data for active timelines: 1) they
|
||||
serve as natural heartbeats -- if node can't send, we shouldn't pull WAL from
|
||||
it 2) it is simpler -- no need to track delivery to/from the broker.
|
||||
Moreover, latency here is important: the faster we obtain fresh data, the
|
||||
faster we can switch to proper safekeeper after failure.
|
||||
* As for WAL offloading leader election, it is trivial to achieve through these
|
||||
heartbeats -- just take suitable node through deterministic rule (min node
|
||||
id). Once network is stable, this is a converging process (well, except
|
||||
complicated failure topology, but even then making it converge is not
|
||||
hard). Such elections bear some risk of several offloaders running
|
||||
concurrently for a short period of time, but that's harmless.
|
||||
|
||||
Generally, if one needs strong consistency, electing leader per se is not
|
||||
enough; it must be accompanied with number (logical clock ts), checked at
|
||||
every action to track causality. s3 doesn't provide CAS, so it can't
|
||||
differentiate old/new leader, this must be solved differently.
|
||||
|
||||
We could use etcd CAS (its most powerful/useful primitive actually) to issue
|
||||
these leader numbers (and e.g. prefix files in s3), but currently I don't see
|
||||
need for that.
|
||||
|
||||
|
||||
Obviously best effort pub sub is much simpler and more performant; the one proposed is described below.
|
||||
|
||||
## gRPC broker
|
||||
|
||||
I took tonic and [prototyped](https://github.com/neondatabase/neon/blob/asher/neon-broker/broker/src/broker.rs) the replacement of functionality we currently use
|
||||
with grpc streams and tokio mpsc channels. The implementation description is at the file header.
|
||||
|
||||
It is just 500 lines of code and core functionality is complete. 1-1 pub sub
|
||||
gives about 120k received messages per second; having multiple subscribers in
|
||||
different connections quickly scales to 1 million received messages per second.
|
||||
I had concerns about many concurrent streams in a single connection, but 2^20
|
||||
subscribers still work (though eat memory, with 10 publishers 20GB are consumed;
|
||||
in this implementation each publisher holds full copy of all subscribers). There
|
||||
is `bench.rs` nearby which I used for testing.
|
||||
|
||||
`SkTimelineInfo` is wired here, but another message can be added (e.g. if
|
||||
pageservers want to communicate with each other) with templating.
|
||||
|
||||
### Fault tolerance
|
||||
|
||||
Since such broker is stateless, we can run it under k8s. Or add proxying to
|
||||
other members, with best-effort this is simple.
|
||||
|
||||
### Security implications
|
||||
|
||||
Communication happens in a private network that is not exposed to users;
|
||||
additionally, we can add auth to the broker.
|
||||
|
||||
## Alternative: get existing pub-sub
|
||||
|
||||
We could take some existing pub sub solution, e.g. RabbitMQ, Redis. But in this
|
||||
case IMV the simplicity of our own solution outweighs external dependency costs (RabbitMQ is
|
||||
much more complicated and needs VM; Redis Rust client maintenance is not
|
||||
ideal...). Also note that projects like CockroachDB and TiDB are based on gRPC
|
||||
as well.
|
||||
|
||||
## Alternative: direct communication
|
||||
|
||||
Apart from being transport, broker solves one more task: discovery, i.e. letting
|
||||
safekeepers and pageservers find each other. We can let safekeepers know, for
|
||||
each timeline, both other safekeepers for this timeline and pageservers serving
|
||||
it. In this case direct communication is possible:
|
||||
- each safekeeper pushes to each other safekeeper status of timelines residing
|
||||
on both of them, letting them remove WAL, decide who offloads, decide on peer
|
||||
recovery;
|
||||
- each safekeeper pushes to each pageserver status of timelines residing on
|
||||
both of them, letting pageserver choose from which sk to pull WAL;
|
||||
|
||||
It was mostly described in [014-safekeeper-gossip](https://github.com/neondatabase/neon/blob/main/docs/rfcs/014-safekeepers-gossip.md), but I want to recap on that.
|
||||
|
||||
The main pro is one less dependency: fewer moving parts, easier to run Neon
|
||||
locally/manually, fewer places to monitor. Fault tolerance for broker disappears,
|
||||
no kuber or something. To me this is a big thing.
|
||||
|
||||
Also (though not a big thing) idle watches for inactive timelines disappear:
|
||||
naturally safekeepers learn about compute connection first and start pushing
|
||||
status to pageserver(s), notifying it should pull.
|
||||
|
||||
Importantly, I think that eventually knowing and persisting peers and
|
||||
pageservers on safekeepers is inevitable:
|
||||
- Knowing peer safekeepers for the timeline is required for correct
|
||||
automatic membership change -- new member set must be hardened on old
|
||||
majority before proceeding. It is required to get rid of sync-safekeepers
|
||||
as well (peer recovery up to flush_lsn).
|
||||
- Knowing pageservers where the timeline is attached is needed to
|
||||
1. Understand when to shut down activity on the timeline, i.e. push data to
|
||||
the broker. We can have a lot of timelines sleeping quietly which
|
||||
shouldn't occupy resources.
|
||||
2. Preserve WAL for these (currently we offload to s3 and take it from there,
|
||||
but serving locally is better, and we get one less condition on which WAL
|
||||
can be removed from s3).
|
||||
|
||||
I suppose this membership data should be passed to safekeepers directly from the
|
||||
console because
|
||||
1. Console is the original source of this data, conceptually this is the
|
||||
simplest way (rather than passing it through compute or something).
|
||||
2. We already have similar code for deleting timeline on safekeepers
|
||||
(and attaching/detaching timeline on pageserver), this is a typical
|
||||
action -- queue operation against storage node and execute it until it
|
||||
completes (or timeline is dropped).
|
||||
|
||||
Cons of direct communication are
|
||||
- It is more complicated: each safekeeper should maintain set of peers it talks
|
||||
to, and set of timelines for each such peer -- they ought to be multiplexed
|
||||
into single connection.
|
||||
- Totally, we have O(n^2) connections instead of O(n) with broker schema
|
||||
(still O(n) on each node). However, these are relatively stable, async and
|
||||
thus not very expensive, I don't think this is a big problem. Up to 10k
|
||||
storage nodes I doubt connection overhead would be noticeable.
|
||||
|
||||
I'd use gRPC for direct communication, and in this sense gRPC based broker is a
|
||||
step towards it.
|
||||
@@ -8,7 +8,7 @@
|
||||
regex = "1.4.5"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "1.12.0"
|
||||
serde_with = "2.0"
|
||||
once_cell = "1.13.0"
|
||||
|
||||
utils = { path = "../utils" }
|
||||
|
||||
@@ -77,6 +77,16 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
|
||||
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
|
||||
];
|
||||
|
||||
pub fn set_build_info_metric(revision: &str) {
|
||||
let metric = register_int_gauge_vec!(
|
||||
"libmetrics_build_info",
|
||||
"Build/version information",
|
||||
&["revision"]
|
||||
)
|
||||
.expect("Failed to register build info metric");
|
||||
metric.with_label_values(&[revision]).set(1);
|
||||
}
|
||||
|
||||
// Records I/O stats in a "cross-platform" way.
|
||||
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
|
||||
// An alternative is to read procfs (`/proc/[pid]/io`) which does not work under macOS at all, hence abandoned.
|
||||
|
||||
@@ -5,7 +5,7 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "1.12.0"
|
||||
serde_with = "2.0"
|
||||
const_format = "0.2.21"
|
||||
|
||||
utils = { path = "../utils" }
|
||||
|
||||
@@ -123,9 +123,15 @@ pub struct TenantInfo {
|
||||
pub has_in_progress_downloads: Option<bool>,
|
||||
}
|
||||
|
||||
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct LocalTimelineInfo {
|
||||
pub struct TimelineInfo {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub timeline_id: TimelineId,
|
||||
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_timeline_id: Option<TimelineId>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
@@ -149,28 +155,33 @@ pub struct LocalTimelineInfo {
|
||||
/// the timestamp (in microseconds) of the last received message
|
||||
pub last_received_msg_ts: Option<u128>,
|
||||
pub pg_version: u32,
|
||||
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub remote_consistent_lsn: Option<Lsn>,
|
||||
pub awaits_download: bool,
|
||||
|
||||
// Some of the above fields are duplicated in 'local' and 'remote', for backwards-
|
||||
// compatibility with older clients.
|
||||
pub local: LocalTimelineInfo,
|
||||
pub remote: RemoteTimelineInfo,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct LocalTimelineInfo {
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_timeline_id: Option<TimelineId>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_lsn: Option<Lsn>,
|
||||
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct RemoteTimelineInfo {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
pub awaits_download: bool,
|
||||
}
|
||||
|
||||
///
|
||||
/// This represents the output of the "timeline_detail" API call.
|
||||
///
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineInfo {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub timeline_id: TimelineId,
|
||||
pub local: Option<LocalTimelineInfo>,
|
||||
pub remote: Option<RemoteTimelineInfo>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub remote_consistent_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
|
||||
|
||||
@@ -13,7 +13,7 @@ crc32c = "0.6.0"
|
||||
hex = "0.4.3"
|
||||
once_cell = "1.13.0"
|
||||
log = "0.4.14"
|
||||
memoffset = "0.6.2"
|
||||
memoffset = "0.7"
|
||||
thiserror = "1.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
utils = { path = "../utils" }
|
||||
@@ -26,4 +26,4 @@ wal_craft = { path = "wal_craft" }
|
||||
|
||||
[build-dependencies]
|
||||
anyhow = "1.0"
|
||||
bindgen = "0.60.1"
|
||||
bindgen = "0.61"
|
||||
|
||||
@@ -7,7 +7,7 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
clap = "3.0"
|
||||
clap = "4.0"
|
||||
env_logger = "0.9"
|
||||
log = "0.4"
|
||||
once_cell = "1.13.0"
|
||||
|
||||
@@ -1,68 +1,19 @@
|
||||
use anyhow::*;
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
use std::str::FromStr;
|
||||
use clap::{value_parser, Arg, ArgMatches, Command};
|
||||
use std::{path::PathBuf, str::FromStr};
|
||||
use wal_craft::*;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("wal_craft=info"))
|
||||
.init();
|
||||
let type_arg = &Arg::new("type")
|
||||
.takes_value(true)
|
||||
.help("Type of WAL to craft")
|
||||
.possible_values([
|
||||
Simple::NAME,
|
||||
LastWalRecordXlogSwitch::NAME,
|
||||
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME,
|
||||
WalRecordCrossingSegmentFollowedBySmallOne::NAME,
|
||||
LastWalRecordCrossingSegment::NAME,
|
||||
])
|
||||
.required(true);
|
||||
let arg_matches = App::new("Postgres WAL crafter")
|
||||
.about("Crafts Postgres databases with specific WAL properties")
|
||||
.subcommand(
|
||||
App::new("print-postgres-config")
|
||||
.about("Print the configuration required for PostgreSQL server before running this script")
|
||||
)
|
||||
.subcommand(
|
||||
App::new("with-initdb")
|
||||
.about("Craft WAL in a new data directory first initialized with initdb")
|
||||
.arg(type_arg)
|
||||
.arg(
|
||||
Arg::new("datadir")
|
||||
.takes_value(true)
|
||||
.help("Data directory for the Postgres server")
|
||||
.required(true)
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pg-distrib-dir")
|
||||
.long("pg-distrib-dir")
|
||||
.takes_value(true)
|
||||
.help("Directory with Postgres distributions (bin and lib directories, e.g. pg_install containing subpath `v14/bin/postgresql`)")
|
||||
.default_value("/usr/local")
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pg-version")
|
||||
.long("pg-version")
|
||||
.help("Postgres version to use for the initial tenant")
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
App::new("in-existing")
|
||||
.about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.")
|
||||
.arg(type_arg)
|
||||
.arg(
|
||||
Arg::new("connection")
|
||||
.takes_value(true)
|
||||
.help("Connection string to the Postgres database to populate")
|
||||
.required(true)
|
||||
)
|
||||
)
|
||||
.get_matches();
|
||||
let arg_matches = cli().get_matches();
|
||||
|
||||
let wal_craft = |arg_matches: &ArgMatches, client| {
|
||||
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches.value_of("type").unwrap() {
|
||||
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches
|
||||
.get_one::<String>("type")
|
||||
.map(|s| s.as_str())
|
||||
.context("'type' is required")?
|
||||
{
|
||||
Simple::NAME => Simple::craft(client)?,
|
||||
LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?,
|
||||
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => {
|
||||
@@ -72,12 +23,12 @@ fn main() -> Result<()> {
|
||||
WalRecordCrossingSegmentFollowedBySmallOne::craft(client)?
|
||||
}
|
||||
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
|
||||
a => panic!("Unknown --type argument: {}", a),
|
||||
a => panic!("Unknown --type argument: {a}"),
|
||||
};
|
||||
for lsn in intermediate_lsns {
|
||||
println!("intermediate_lsn = {}", lsn);
|
||||
println!("intermediate_lsn = {lsn}");
|
||||
}
|
||||
println!("end_of_wal = {}", end_of_wal_lsn);
|
||||
println!("end_of_wal = {end_of_wal_lsn}");
|
||||
Ok(())
|
||||
};
|
||||
|
||||
@@ -85,20 +36,24 @@ fn main() -> Result<()> {
|
||||
None => panic!("No subcommand provided"),
|
||||
Some(("print-postgres-config", _)) => {
|
||||
for cfg in REQUIRED_POSTGRES_CONFIG.iter() {
|
||||
println!("{}", cfg);
|
||||
println!("{cfg}");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Some(("with-initdb", arg_matches)) => {
|
||||
let cfg = Conf {
|
||||
pg_version: arg_matches
|
||||
.value_of("pg-version")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.context("Failed to parse postgres version from the argument string")?,
|
||||
pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
|
||||
datadir: arg_matches.value_of("datadir").unwrap().into(),
|
||||
pg_version: *arg_matches
|
||||
.get_one::<u32>("pg-version")
|
||||
.context("'pg-version' is required")?,
|
||||
pg_distrib_dir: arg_matches
|
||||
.get_one::<PathBuf>("pg-distrib-dir")
|
||||
.context("'pg-distrib-dir' is required")?
|
||||
.to_owned(),
|
||||
datadir: arg_matches
|
||||
.get_one::<PathBuf>("datadir")
|
||||
.context("'datadir' is required")?
|
||||
.to_owned(),
|
||||
};
|
||||
cfg.initdb()?;
|
||||
let srv = cfg.start_server()?;
|
||||
@@ -108,9 +63,77 @@ fn main() -> Result<()> {
|
||||
}
|
||||
Some(("in-existing", arg_matches)) => wal_craft(
|
||||
arg_matches,
|
||||
&mut postgres::Config::from_str(arg_matches.value_of("connection").unwrap())?
|
||||
.connect(postgres::NoTls)?,
|
||||
&mut postgres::Config::from_str(
|
||||
arg_matches
|
||||
.get_one::<String>("connection")
|
||||
.context("'connection' is required")?,
|
||||
)
|
||||
.context(
|
||||
"'connection' argument value could not be parsed as a postgres connection string",
|
||||
)?
|
||||
.connect(postgres::NoTls)?,
|
||||
),
|
||||
Some(_) => panic!("Unknown subcommand"),
|
||||
}
|
||||
}
|
||||
|
||||
fn cli() -> Command {
|
||||
let type_arg = &Arg::new("type")
|
||||
.help("Type of WAL to craft")
|
||||
.value_parser([
|
||||
Simple::NAME,
|
||||
LastWalRecordXlogSwitch::NAME,
|
||||
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME,
|
||||
WalRecordCrossingSegmentFollowedBySmallOne::NAME,
|
||||
LastWalRecordCrossingSegment::NAME,
|
||||
])
|
||||
.required(true);
|
||||
|
||||
Command::new("Postgres WAL crafter")
|
||||
.about("Crafts Postgres databases with specific WAL properties")
|
||||
.subcommand(
|
||||
Command::new("print-postgres-config")
|
||||
.about("Print the configuration required for PostgreSQL server before running this script")
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("with-initdb")
|
||||
.about("Craft WAL in a new data directory first initialized with initdb")
|
||||
.arg(type_arg)
|
||||
.arg(
|
||||
Arg::new("datadir")
|
||||
.help("Data directory for the Postgres server")
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
.required(true)
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pg-distrib-dir")
|
||||
.long("pg-distrib-dir")
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
.help("Directory with Postgres distributions (bin and lib directories, e.g. pg_install containing subpath `v14/bin/postgresql`)")
|
||||
.default_value("/usr/local")
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pg-version")
|
||||
.long("pg-version")
|
||||
.help("Postgres version to use for the initial tenant")
|
||||
.value_parser(value_parser!(u32))
|
||||
.required(true)
|
||||
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("in-existing")
|
||||
.about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.")
|
||||
.arg(type_arg)
|
||||
.arg(
|
||||
Arg::new("connection")
|
||||
.help("Connection string to the Postgres database to populate")
|
||||
.required(true)
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
/// Sanity-checks the clap command definition built by `cli()`.
#[test]
fn verify_cli() {
    let command = cli();
    command.debug_assert();
}
|
||||
|
||||
@@ -15,7 +15,7 @@ serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
toml_edit = { version = "0.13", features = ["easy"] }
|
||||
toml_edit = { version = "0.14", features = ["easy"] }
|
||||
tracing = "0.1.27"
|
||||
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -5,7 +5,7 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "1.12.0"
|
||||
serde_with = "2.0"
|
||||
const_format = "0.2.21"
|
||||
|
||||
utils = { path = "../utils" }
|
||||
|
||||
@@ -1,8 +1,24 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TimelineCreateRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub timeline_id: TimelineId,
|
||||
pub peer_ids: Vec<NodeId>,
|
||||
pub peer_ids: Option<Vec<NodeId>>,
|
||||
pub pg_version: u32,
|
||||
pub system_id: Option<u64>,
|
||||
pub wal_seg_size: Option<u32>,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub commit_lsn: Lsn,
|
||||
// If not passed, it is assigned to the beginning of commit_lsn segment.
|
||||
pub local_start_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ tokio = { version = "1.17", features = ["macros"]}
|
||||
tokio-rustls = "0.23"
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
nix = "0.23.0"
|
||||
nix = "0.25"
|
||||
signal-hook = "0.3.10"
|
||||
rand = "0.8.3"
|
||||
jsonwebtoken = "8"
|
||||
@@ -28,7 +28,7 @@ hex = { version = "0.4.3", features = ["serde"] }
|
||||
rustls = "0.20.2"
|
||||
rustls-split = "0.3.0"
|
||||
git-version = "0.3.5"
|
||||
serde_with = "1.12.0"
|
||||
serde_with = "2.0"
|
||||
once_cell = "1.13.0"
|
||||
|
||||
|
||||
@@ -40,7 +40,7 @@ byteorder = "1.4.3"
|
||||
bytes = "1.0.1"
|
||||
hex-literal = "0.3"
|
||||
tempfile = "3.2"
|
||||
criterion = "0.3"
|
||||
criterion = "0.4"
|
||||
rustls-pemfile = "1"
|
||||
|
||||
[[bench]]
|
||||
|
||||
@@ -66,6 +66,11 @@ impl Lsn {
|
||||
(self.0 % seg_sz as u64) as usize
|
||||
}
|
||||
|
||||
/// Compute LSN of the segment start.
|
||||
pub fn segment_lsn(self, seg_sz: usize) -> Lsn {
|
||||
Lsn(self.0 - (self.0 % seg_sz as u64))
|
||||
}
|
||||
|
||||
/// Compute the segment number
|
||||
pub fn segment_number(self, seg_sz: usize) -> u64 {
|
||||
self.0 / seg_sz as u64
|
||||
|
||||
@@ -15,7 +15,7 @@ use std::sync::Arc;
|
||||
use std::task::Poll;
|
||||
use tracing::{debug, error, trace};
|
||||
|
||||
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
|
||||
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
|
||||
use tokio_rustls::TlsAcceptor;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
@@ -66,8 +66,8 @@ pub enum ProcessMsgResult {
|
||||
/// Always-writeable sock_split stream.
|
||||
/// May not be readable. See [`PostgresBackend::take_stream_in`]
|
||||
pub enum Stream {
|
||||
Unencrypted(tokio::net::TcpStream),
|
||||
Tls(Box<tokio_rustls::server::TlsStream<tokio::net::TcpStream>>),
|
||||
Unencrypted(BufReader<tokio::net::TcpStream>),
|
||||
Tls(Box<tokio_rustls::server::TlsStream<BufReader<tokio::net::TcpStream>>>),
|
||||
Broken,
|
||||
}
|
||||
|
||||
@@ -157,7 +157,7 @@ impl PostgresBackend {
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
|
||||
Ok(Self {
|
||||
stream: Stream::Unencrypted(socket),
|
||||
stream: Stream::Unencrypted(BufReader::new(socket)),
|
||||
buf_out: BytesMut::with_capacity(10 * 1024),
|
||||
state: ProtoState::Initialization,
|
||||
md5_salt: [0u8; 4],
|
||||
|
||||
@@ -10,6 +10,7 @@ use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
fmt,
|
||||
future::Future,
|
||||
io::{self, Cursor},
|
||||
str,
|
||||
@@ -124,6 +125,19 @@ pub struct CancelKeyData {
|
||||
pub cancel_key: i32,
|
||||
}
|
||||
|
||||
impl fmt::Display for CancelKeyData {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let hi = (self.backend_pid as u64) << 32;
|
||||
let lo = self.cancel_key as u64;
|
||||
let id = hi | lo;
|
||||
|
||||
// This format is more compact and might work better for logs.
|
||||
f.debug_tuple("CancelKeyData")
|
||||
.field(&format_args!("{:x}", id))
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
use rand::distributions::{Distribution, Standard};
|
||||
impl Distribution<CancelKeyData> for Standard {
|
||||
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> CancelKeyData {
|
||||
|
||||
@@ -23,7 +23,7 @@ futures = "0.3.13"
|
||||
hex = "0.4.3"
|
||||
hyper = "0.14"
|
||||
itertools = "0.10.3"
|
||||
clap = "3.0"
|
||||
clap = { version = "4.0", features = ["string"] }
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
|
||||
@@ -38,25 +38,25 @@ tar = "0.4.33"
|
||||
humantime = "2.1.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "1.12.0"
|
||||
serde_with = "2.0"
|
||||
humantime-serde = "1.1.1"
|
||||
|
||||
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
|
||||
|
||||
toml_edit = { version = "0.13", features = ["easy"] }
|
||||
toml_edit = { version = "0.14", features = ["easy"] }
|
||||
scopeguard = "1.1.0"
|
||||
const_format = "0.2.21"
|
||||
tracing = "0.1.36"
|
||||
signal-hook = "0.3.10"
|
||||
url = "2"
|
||||
nix = "0.23"
|
||||
nix = "0.25"
|
||||
once_cell = "1.13.0"
|
||||
crossbeam-utils = "0.8.5"
|
||||
fail = "0.5.0"
|
||||
git-version = "0.3.5"
|
||||
rstar = "0.9.3"
|
||||
num-traits = "0.2.15"
|
||||
amplify_num = "0.4.1"
|
||||
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
|
||||
|
||||
pageserver_api = { path = "../libs/pageserver_api" }
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
@@ -67,7 +67,13 @@ remote_storage = { path = "../libs/remote_storage" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
close_fds = "0.3.2"
|
||||
walkdir = "2.3.2"
|
||||
dashmap = "5.4.0"
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = "0.4"
|
||||
hex-literal = "0.3"
|
||||
tempfile = "3.2"
|
||||
|
||||
[[bench]]
|
||||
name = "bench_layer_map"
|
||||
harness = false
|
||||
|
||||
5866
pageserver/benches/bench_layer_map.rs
Normal file
5866
pageserver/benches/bench_layer_map.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,35 +0,0 @@
|
||||
//! Main entry point for the dump_layerfile executable
|
||||
//!
|
||||
//! A handy tool for debugging, that's all.
|
||||
use anyhow::Result;
|
||||
use clap::{App, Arg};
|
||||
use pageserver::page_cache;
|
||||
use pageserver::tenant::dump_layerfile_from_path;
|
||||
use pageserver::virtual_file;
|
||||
use std::path::PathBuf;
|
||||
use utils::project_git_version;
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Neon dump_layerfile utility")
|
||||
.about("Dump contents of one layer file, for debugging")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::new("path")
|
||||
.help("Path to file to dump")
|
||||
.required(true)
|
||||
.index(1),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let path = PathBuf::from(arg_matches.value_of("path").unwrap());
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10);
|
||||
page_cache::init(100);
|
||||
|
||||
dump_layerfile_from_path(&path, true)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -6,10 +6,12 @@ use tracing::*;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
|
||||
use clap::{App, Arg};
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use fail::FailScenario;
|
||||
use metrics::set_build_info_metric;
|
||||
|
||||
use pageserver::{
|
||||
config::{defaults::*, PageServerConf},
|
||||
http, page_cache, page_service, profiling, task_mgr,
|
||||
@@ -31,72 +33,35 @@ use utils::{
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
const FEATURES: &[&str] = &[
|
||||
#[cfg(feature = "testing")]
|
||||
"testing",
|
||||
#[cfg(feature = "fail/failpoints")]
|
||||
"fail/failpoints",
|
||||
#[cfg(feature = "profiling")]
|
||||
"profiling",
|
||||
];
|
||||
|
||||
fn version() -> String {
|
||||
format!(
|
||||
"{GIT_VERSION} profiling:{} failpoints:{}",
|
||||
cfg!(feature = "profiling"),
|
||||
fail::has_failpoints()
|
||||
"{GIT_VERSION} failpoints: {}, features: {:?}",
|
||||
fail::has_failpoints(),
|
||||
FEATURES,
|
||||
)
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = App::new("Neon page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.version(&*version())
|
||||
.arg(
|
||||
let arg_matches = cli().get_matches();
|
||||
|
||||
Arg::new("daemonize")
|
||||
.short('d')
|
||||
.long("daemonize")
|
||||
.takes_value(false)
|
||||
.help("Run in the background"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("init")
|
||||
.long("init")
|
||||
.takes_value(false)
|
||||
.help("Initialize pageserver with all given config overrides"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("workdir")
|
||||
.short('D')
|
||||
.long("workdir")
|
||||
.takes_value(true)
|
||||
.help("Working directory for the pageserver"),
|
||||
)
|
||||
// See `settings.md` for more details on the extra configuration parameters pageserver can process
|
||||
.arg(
|
||||
Arg::new("config-override")
|
||||
.short('c')
|
||||
.takes_value(true)
|
||||
.number_of_values(1)
|
||||
.multiple_occurrences(true)
|
||||
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there).
|
||||
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
|
||||
)
|
||||
.arg(Arg::new("update-config").long("update-config").takes_value(false).help(
|
||||
"Update the config file when started",
|
||||
))
|
||||
.arg(
|
||||
Arg::new("enabled-features")
|
||||
.long("enabled-features")
|
||||
.takes_value(false)
|
||||
.help("Show enabled compile time features"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
if arg_matches.is_present("enabled-features") {
|
||||
let features: &[&str] = &[
|
||||
#[cfg(feature = "testing")]
|
||||
"testing",
|
||||
#[cfg(feature = "profiling")]
|
||||
"profiling",
|
||||
];
|
||||
println!("{{\"features\": {features:?} }}");
|
||||
if arg_matches.get_flag("enabled-features") {
|
||||
println!("{{\"features\": {FEATURES:?} }}");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".neon"));
|
||||
let workdir = arg_matches
|
||||
.get_one::<String>("workdir")
|
||||
.map(Path::new)
|
||||
.unwrap_or_else(|| Path::new(".neon"));
|
||||
let workdir = workdir
|
||||
.canonicalize()
|
||||
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
|
||||
@@ -110,7 +75,7 @@ fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
})?;
|
||||
|
||||
let daemonize = arg_matches.is_present("daemonize");
|
||||
let daemonize = arg_matches.get_flag("daemonize");
|
||||
|
||||
let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
|
||||
ControlFlow::Continue(conf) => conf,
|
||||
@@ -148,8 +113,8 @@ fn initialize_config(
|
||||
arg_matches: clap::ArgMatches,
|
||||
workdir: &Path,
|
||||
) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
|
||||
let init = arg_matches.is_present("init");
|
||||
let update_config = init || arg_matches.is_present("update-config");
|
||||
let init = arg_matches.get_flag("init");
|
||||
let update_config = init || arg_matches.get_flag("update-config");
|
||||
|
||||
let (mut toml, config_file_exists) = if cfg_file_path.is_file() {
|
||||
if init {
|
||||
@@ -191,13 +156,10 @@ fn initialize_config(
|
||||
)
|
||||
};
|
||||
|
||||
if let Some(values) = arg_matches.values_of("config-override") {
|
||||
if let Some(values) = arg_matches.get_many::<String>("config-override") {
|
||||
for option_line in values {
|
||||
let doc = toml_edit::Document::from_str(option_line).with_context(|| {
|
||||
format!(
|
||||
"Option '{}' could not be parsed as a toml document",
|
||||
option_line
|
||||
)
|
||||
format!("Option '{option_line}' could not be parsed as a toml document")
|
||||
})?;
|
||||
|
||||
for (key, item) in doc.iter() {
|
||||
@@ -239,7 +201,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
// Initialize logger
|
||||
let log_file = logging::init(LOG_FILE_NAME, daemonize)?;
|
||||
|
||||
info!("version: {GIT_VERSION}");
|
||||
info!("version: {}", version());
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
@@ -356,6 +318,8 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
},
|
||||
);
|
||||
|
||||
set_build_info_metric(GIT_VERSION);
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
signals.handle(|signal| match signal {
|
||||
Signal::Quit => {
|
||||
@@ -378,3 +342,55 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn cli() -> Command {
|
||||
Command::new("Neon page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.version(version())
|
||||
.arg(
|
||||
|
||||
Arg::new("daemonize")
|
||||
.short('d')
|
||||
.long("daemonize")
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Run in the background"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("init")
|
||||
.long("init")
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Initialize pageserver with all given config overrides"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("workdir")
|
||||
.short('D')
|
||||
.long("workdir")
|
||||
.help("Working directory for the pageserver"),
|
||||
)
|
||||
// See `settings.md` for more details on the extra configuration patameters pageserver can process
|
||||
.arg(
|
||||
Arg::new("config-override")
|
||||
.short('c')
|
||||
.num_args(1)
|
||||
.action(ArgAction::Append)
|
||||
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
|
||||
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("update-config")
|
||||
.long("update-config")
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Update the config file when started"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("enabled-features")
|
||||
.long("enabled-features")
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Show enabled compile time features"),
|
||||
)
|
||||
}
|
||||
|
||||
// Sanity check of the command-line definition: `debug_assert` makes clap
// validate the `Command` built by `cli()` and panics if it is inconsistent.
#[test]
|
||||
fn verify_cli() {
|
||||
cli().debug_assert();
|
||||
}
|
||||
|
||||
154
pageserver/src/bin/pageserver_binutils.rs
Normal file
154
pageserver/src/bin/pageserver_binutils.rs
Normal file
@@ -0,0 +1,154 @@
|
||||
//! A helper tool to manage pageserver binary files.
|
||||
//! Accepts a file as an argument, attempts to parse it with all ways possible
|
||||
//! and prints its interpreted context.
|
||||
//!
|
||||
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
|
||||
use std::{
|
||||
path::{Path, PathBuf},
|
||||
str::FromStr,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use clap::{value_parser, Arg, Command};
|
||||
|
||||
use pageserver::{
|
||||
page_cache,
|
||||
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
|
||||
virtual_file,
|
||||
};
|
||||
use postgres_ffi::ControlFileData;
|
||||
use utils::{lsn::Lsn, project_git_version};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
const METADATA_SUBCOMMAND: &str = "metadata";
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = cli().get_matches();
|
||||
|
||||
match arg_matches.subcommand() {
|
||||
Some((subcommand_name, subcommand_matches)) => {
|
||||
let path = subcommand_matches
|
||||
.get_one::<PathBuf>("metadata_path")
|
||||
.context("'metadata_path' argument is missing")?
|
||||
.to_path_buf();
|
||||
anyhow::ensure!(
|
||||
subcommand_name == METADATA_SUBCOMMAND,
|
||||
"Unknown subcommand {subcommand_name}"
|
||||
);
|
||||
handle_metadata(&path, subcommand_matches)?;
|
||||
}
|
||||
None => {
|
||||
let path = arg_matches
|
||||
.get_one::<PathBuf>("path")
|
||||
.context("'path' argument is missing")?
|
||||
.to_path_buf();
|
||||
println!(
|
||||
"No subcommand specified, attempting to guess the format for file {}",
|
||||
path.display()
|
||||
);
|
||||
if let Err(e) = read_pg_control_file(&path) {
|
||||
println!(
|
||||
"Failed to read input file as a pg control one: {e:#}\n\
|
||||
Attempting to read it as layer file"
|
||||
);
|
||||
print_layerfile(&path)?;
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
|
||||
let control_file = ControlFileData::decode(&std::fs::read(&control_file_path)?)?;
|
||||
println!("{control_file:?}");
|
||||
let control_file_initdb = Lsn(control_file.checkPoint);
|
||||
println!(
|
||||
"pg_initdb_lsn: {}, aligned: {}",
|
||||
control_file_initdb,
|
||||
control_file_initdb.align()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Dumps the layer file at `path` via `dump_layerfile_from_path`.
///
/// `virtual_file` and `page_cache` are initialised first, with the fixed
/// arguments 10 and 100, before the dump is attempted.
/// NOTE(review): the `true` flag presumably requests a verbose dump — confirm
/// against `dump_layerfile_from_path`.
fn print_layerfile(path: &Path) -> anyhow::Result<()> {
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10);
|
||||
page_cache::init(100);
|
||||
dump_layerfile_from_path(path, true)
|
||||
}
|
||||
|
||||
fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
|
||||
let metadata_bytes = std::fs::read(&path)?;
|
||||
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
|
||||
println!("Current metadata:\n{meta:?}");
|
||||
let mut update_meta = false;
|
||||
if let Some(disk_consistent_lsn) = arg_matches.get_one::<String>("disk_consistent_lsn") {
|
||||
meta = TimelineMetadata::new(
|
||||
Lsn::from_str(disk_consistent_lsn)?,
|
||||
meta.prev_record_lsn(),
|
||||
meta.ancestor_timeline(),
|
||||
meta.ancestor_lsn(),
|
||||
meta.latest_gc_cutoff_lsn(),
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
if let Some(prev_record_lsn) = arg_matches.get_one::<String>("prev_record_lsn") {
|
||||
meta = TimelineMetadata::new(
|
||||
meta.disk_consistent_lsn(),
|
||||
Some(Lsn::from_str(prev_record_lsn)?),
|
||||
meta.ancestor_timeline(),
|
||||
meta.ancestor_lsn(),
|
||||
meta.latest_gc_cutoff_lsn(),
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
|
||||
if update_meta {
|
||||
let metadata_bytes = meta.to_bytes()?;
|
||||
std::fs::write(&path, &metadata_bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cli() -> Command {
|
||||
Command::new("Neon Pageserver binutils")
|
||||
.about("Reads pageserver (and related) binary files management utility")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::new("path")
|
||||
.help("Input file path")
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
.required(false),
|
||||
)
|
||||
.subcommand(
|
||||
Command::new(METADATA_SUBCOMMAND)
|
||||
.about("Read and update pageserver metadata file")
|
||||
.arg(
|
||||
Arg::new("metadata_path")
|
||||
.help("Input metadata file path")
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
.required(false),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("disk_consistent_lsn")
|
||||
.long("disk_consistent_lsn")
|
||||
.help("Replace disk consistent Lsn"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("prev_record_lsn")
|
||||
.long("prev_record_lsn")
|
||||
.help("Replace previous record Lsn"),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
// Sanity check of the command-line definition: `debug_assert` makes clap
// validate the `Command` built by `cli()` and panics if it is inconsistent.
#[test]
|
||||
fn verify_cli() {
|
||||
cli().debug_assert();
|
||||
}
|
||||
@@ -1,75 +0,0 @@
|
||||
//! Main entry point for the edit_metadata executable
|
||||
//!
|
||||
//! A handy tool for debugging, that's all.
|
||||
use anyhow::Result;
|
||||
use clap::{App, Arg};
|
||||
use pageserver::tenant::metadata::TimelineMetadata;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use utils::{lsn::Lsn, project_git_version};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Neon update metadata utility")
|
||||
.about("Dump or update metadata file")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::new("path")
|
||||
.help("Path to metadata file")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("disk_lsn")
|
||||
.short('d')
|
||||
.long("disk_lsn")
|
||||
.takes_value(true)
|
||||
.help("Replace disk constistent lsn"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("prev_lsn")
|
||||
.short('p')
|
||||
.long("prev_lsn")
|
||||
.takes_value(true)
|
||||
.help("Previous record LSN"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let path = PathBuf::from(arg_matches.value_of("path").unwrap());
|
||||
let metadata_bytes = std::fs::read(&path)?;
|
||||
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
|
||||
println!("Current metadata:\n{:?}", &meta);
|
||||
|
||||
let mut update_meta = false;
|
||||
|
||||
if let Some(disk_lsn) = arg_matches.value_of("disk_lsn") {
|
||||
meta = TimelineMetadata::new(
|
||||
Lsn::from_str(disk_lsn)?,
|
||||
meta.prev_record_lsn(),
|
||||
meta.ancestor_timeline(),
|
||||
meta.ancestor_lsn(),
|
||||
meta.latest_gc_cutoff_lsn(),
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
|
||||
if let Some(prev_lsn) = arg_matches.value_of("prev_lsn") {
|
||||
meta = TimelineMetadata::new(
|
||||
meta.disk_consistent_lsn(),
|
||||
Some(Lsn::from_str(prev_lsn)?),
|
||||
meta.ancestor_timeline(),
|
||||
meta.ancestor_lsn(),
|
||||
meta.latest_gc_cutoff_lsn(),
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
if update_meta {
|
||||
let metadata_bytes = meta.to_bytes()?;
|
||||
std::fs::write(&path, &metadata_bytes)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,7 +1,11 @@
|
||||
openapi: "3.0.2"
|
||||
info:
|
||||
title: Page Server API
|
||||
description: Neon Pageserver API
|
||||
version: "1.0"
|
||||
license:
|
||||
name: "Apache"
|
||||
url: https://github.com/neondatabase/neon/blob/main/LICENSE
|
||||
servers:
|
||||
- url: ""
|
||||
paths:
|
||||
@@ -207,7 +211,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -612,6 +615,9 @@ components:
|
||||
required:
|
||||
- timeline_id
|
||||
- tenant_id
|
||||
- last_record_lsn
|
||||
- disk_consistent_lsn
|
||||
- awaits_download
|
||||
properties:
|
||||
timeline_id:
|
||||
type: string
|
||||
@@ -619,33 +625,15 @@ components:
|
||||
tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
local:
|
||||
$ref: "#/components/schemas/LocalTimelineInfo"
|
||||
remote:
|
||||
$ref: "#/components/schemas/RemoteTimelineInfo"
|
||||
RemoteTimelineInfo:
|
||||
type: object
|
||||
required:
|
||||
- awaits_download
|
||||
- remote_consistent_lsn
|
||||
properties:
|
||||
awaits_download:
|
||||
type: boolean
|
||||
remote_consistent_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
LocalTimelineInfo:
|
||||
type: object
|
||||
required:
|
||||
- last_record_lsn
|
||||
- disk_consistent_lsn
|
||||
properties:
|
||||
last_record_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
disk_consistent_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
remote_consistent_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
ancestor_timeline_id:
|
||||
type: string
|
||||
format: hex
|
||||
@@ -670,7 +658,39 @@ components:
|
||||
format: hex
|
||||
last_received_msg_ts:
|
||||
type: integer
|
||||
awaits_download:
|
||||
type: boolean
|
||||
|
||||
# These 'local' and 'remote' fields just duplicate some of the fields
|
||||
# above. They are kept for backwards-compatibility. They can be removed,
|
||||
# when the control plane has been updated to look at the above fields
|
||||
# directly.
|
||||
local:
|
||||
$ref: "#/components/schemas/LocalTimelineInfo"
|
||||
remote:
|
||||
$ref: "#/components/schemas/RemoteTimelineInfo"
|
||||
|
||||
LocalTimelineInfo:
|
||||
type: object
|
||||
properties:
|
||||
ancestor_timeline_id:
|
||||
type: string
|
||||
format: hex
|
||||
ancestor_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
current_logical_size:
|
||||
type: integer
|
||||
current_physical_size:
|
||||
type: integer
|
||||
RemoteTimelineInfo:
|
||||
type: object
|
||||
required:
|
||||
- remote_consistent_lsn
|
||||
properties:
|
||||
remote_consistent_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
Error:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -79,13 +79,13 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
|
||||
get_state(request).conf
|
||||
}
|
||||
|
||||
// Helper functions to construct a LocalTimelineInfo struct for a timeline
|
||||
|
||||
fn local_timeline_info_from_timeline(
|
||||
// Helper function to construct a TimelineInfo struct for a timeline
|
||||
async fn build_timeline_info(
|
||||
state: &State,
|
||||
timeline: &Arc<Timeline>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
include_non_incremental_physical_size: bool,
|
||||
) -> anyhow::Result<LocalTimelineInfo> {
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
|
||||
let guard = timeline.last_received_wal.lock().unwrap();
|
||||
@@ -100,24 +100,47 @@ fn local_timeline_info_from_timeline(
|
||||
}
|
||||
};
|
||||
|
||||
let info = LocalTimelineInfo {
|
||||
ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
|
||||
ancestor_lsn: {
|
||||
match timeline.get_ancestor_lsn() {
|
||||
Lsn(0) => None,
|
||||
lsn @ Lsn(_) => Some(lsn),
|
||||
}
|
||||
},
|
||||
let (remote_consistent_lsn, awaits_download) = if let Some(remote_entry) = state
|
||||
.remote_index
|
||||
.read()
|
||||
.await
|
||||
.timeline_entry(&TenantTimelineId {
|
||||
tenant_id: timeline.tenant_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
}) {
|
||||
(
|
||||
Some(remote_entry.metadata.disk_consistent_lsn()),
|
||||
remote_entry.awaits_download,
|
||||
)
|
||||
} else {
|
||||
(None, false)
|
||||
};
|
||||
|
||||
let ancestor_timeline_id = timeline.get_ancestor_timeline_id();
|
||||
let ancestor_lsn = match timeline.get_ancestor_lsn() {
|
||||
Lsn(0) => None,
|
||||
lsn @ Lsn(_) => Some(lsn),
|
||||
};
|
||||
let current_logical_size = match timeline.get_current_logical_size() {
|
||||
Ok(size) => Some(size),
|
||||
Err(err) => {
|
||||
error!("Timeline info creation failed to get current logical size: {err:?}");
|
||||
None
|
||||
}
|
||||
};
|
||||
let current_physical_size = Some(timeline.get_physical_size());
|
||||
|
||||
let info = TimelineInfo {
|
||||
tenant_id: timeline.tenant_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
ancestor_timeline_id,
|
||||
ancestor_lsn,
|
||||
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
|
||||
last_record_lsn,
|
||||
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
|
||||
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
|
||||
current_logical_size: Some(
|
||||
timeline
|
||||
.get_current_logical_size()
|
||||
.context("Timeline info creation failed to get current logical size")?,
|
||||
),
|
||||
current_physical_size: Some(timeline.get_physical_size()),
|
||||
current_logical_size,
|
||||
current_physical_size,
|
||||
current_logical_size_non_incremental: if include_non_incremental_logical_size {
|
||||
Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?)
|
||||
} else {
|
||||
@@ -132,32 +155,25 @@ fn local_timeline_info_from_timeline(
|
||||
last_received_msg_lsn,
|
||||
last_received_msg_ts,
|
||||
pg_version: timeline.pg_version,
|
||||
|
||||
remote_consistent_lsn,
|
||||
awaits_download,
|
||||
|
||||
// Duplicate some fields in 'local' and 'remote' fields, for backwards-compatibility
|
||||
// with the control plane.
|
||||
local: LocalTimelineInfo {
|
||||
ancestor_timeline_id,
|
||||
ancestor_lsn,
|
||||
current_logical_size,
|
||||
current_physical_size,
|
||||
},
|
||||
remote: RemoteTimelineInfo {
|
||||
remote_consistent_lsn,
|
||||
},
|
||||
};
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
fn list_local_timelines(
|
||||
tenant_id: TenantId,
|
||||
include_non_incremental_logical_size: bool,
|
||||
include_non_incremental_physical_size: bool,
|
||||
) -> Result<Vec<(TimelineId, LocalTimelineInfo)>> {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
|
||||
let timelines = tenant.list_timelines();
|
||||
|
||||
let mut local_timeline_info = Vec::with_capacity(timelines.len());
|
||||
for (timeline_id, repository_timeline) in timelines {
|
||||
local_timeline_info.push((
|
||||
timeline_id,
|
||||
local_timeline_info_from_timeline(
|
||||
&repository_timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
)?,
|
||||
))
|
||||
}
|
||||
Ok(local_timeline_info)
|
||||
}
|
||||
|
||||
// healthcheck handler
|
||||
async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let config = get_config(&request);
|
||||
@@ -169,6 +185,8 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let new_timeline_info = async {
|
||||
match tenant.create_timeline(
|
||||
@@ -179,14 +197,10 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
).await {
|
||||
Ok(Some(new_timeline)) => {
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)
|
||||
let timeline_info = build_timeline_info(state, &new_timeline, false, false)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
Ok(Some(TimelineInfo {
|
||||
tenant_id,
|
||||
timeline_id: new_timeline.timeline_id,
|
||||
local: Some(local_info),
|
||||
remote: None,
|
||||
}))
|
||||
Ok(Some(timeline_info))
|
||||
}
|
||||
Ok(None) => Ok(None), // timeline already exists
|
||||
Err(err) => Err(ApiError::InternalServerError(err)),
|
||||
@@ -209,6 +223,8 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
query_param_present(&request, "include-non-incremental-physical-size");
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timelines = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
@@ -218,36 +234,18 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
for (timeline_id, timeline) in timelines {
|
||||
let local = match local_timeline_info_from_timeline(
|
||||
for timeline in timelines {
|
||||
let timeline_info = build_timeline_info(
|
||||
state,
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
) {
|
||||
Ok(local) => Some(local),
|
||||
Err(e) => {
|
||||
error!("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}");
|
||||
None
|
||||
}
|
||||
};
|
||||
)
|
||||
.await
|
||||
// Build the message with `format!`: a plain string literal handed to
// `.context()` is not interpolated, so the `{...}` placeholders used to end
// up verbatim in the error. The underlying error is kept as the cause.
.with_context(|| {
    format!(
        "Failed to convert tenant timeline {} into the local one",
        timeline.timeline_id
    )
})
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
response_data.push(TimelineInfo {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
local,
|
||||
remote: get_state(&request)
|
||||
.remote_index
|
||||
.read()
|
||||
.await
|
||||
.timeline_entry(&TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_entry| RemoteTimelineInfo {
|
||||
remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.awaits_download,
|
||||
}),
|
||||
})
|
||||
response_data.push(timeline_info);
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, response_data)
|
||||
@@ -292,59 +290,33 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
query_param_present(&request, "include-non-incremental-physical-size");
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let (local_timeline_info, remote_timeline_info) = async {
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline_info = async {
|
||||
let timeline = tokio::task::spawn_blocking(move || {
|
||||
tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id)
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
let local_timeline_info = match timeline.and_then(|timeline| {
|
||||
local_timeline_info_from_timeline(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
)
|
||||
}) {
|
||||
Ok(local_info) => Some(local_info),
|
||||
Err(e) => {
|
||||
error!("Failed to get local timeline info: {e:#}");
|
||||
None
|
||||
}
|
||||
};
|
||||
let timeline = timeline.map_err(ApiError::NotFound)?;
|
||||
|
||||
let remote_timeline_info = {
|
||||
let remote_index_read = get_state(&request).remote_index.read().await;
|
||||
remote_index_read
|
||||
.timeline_entry(&TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_entry| RemoteTimelineInfo {
|
||||
remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.awaits_download,
|
||||
})
|
||||
};
|
||||
Ok::<_, ApiError>((local_timeline_info, remote_timeline_info))
|
||||
let timeline_info = build_timeline_info(
|
||||
state,
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
)
|
||||
.await
|
||||
// The underlying error is kept as the cause of this context; the former
// literal "{e:#}" placeholder was never interpolated and showed up verbatim.
.context("Failed to get local timeline info")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
Ok::<_, ApiError>(timeline_info)
|
||||
}
|
||||
.instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await?;
|
||||
|
||||
if local_timeline_info.is_none() && remote_timeline_info.is_none() {
|
||||
Err(ApiError::NotFound(anyhow!(
|
||||
"Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely"
|
||||
)))
|
||||
} else {
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
TimelineInfo {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
local: local_timeline_info,
|
||||
remote: remote_timeline_info,
|
||||
},
|
||||
)
|
||||
}
|
||||
json_response(StatusCode::OK, timeline_info)
|
||||
}
|
||||
|
||||
async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -414,7 +386,7 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
}
|
||||
return json_response(StatusCode::ACCEPTED, ());
|
||||
}
|
||||
// no tenant in the index, release the lock to make the potentially lengthy download opetation
|
||||
// no tenant in the index, release the lock to make the potentially lengthy download operation
|
||||
drop(index_accessor);
|
||||
|
||||
// download index parts for every tenant timeline
|
||||
@@ -566,36 +538,27 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
false
|
||||
});
|
||||
|
||||
let tenant_state = match tenant {
|
||||
Ok(tenant) => tenant.current_state(),
|
||||
let (tenant_state, current_physical_size) = match tenant {
|
||||
Ok(tenant) => {
|
||||
let timelines = tenant.list_timelines();
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
for timeline in timelines {
|
||||
current_physical_size += timeline.get_physical_size();
|
||||
}
|
||||
|
||||
(tenant.current_state(), Some(current_physical_size))
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to get local tenant state: {e:#}");
|
||||
if has_in_progress_downloads {
|
||||
TenantState::Paused
|
||||
(TenantState::Paused, None)
|
||||
} else {
|
||||
TenantState::Broken
|
||||
(TenantState::Broken, None)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let current_physical_size =
|
||||
match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false))
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?
|
||||
{
|
||||
Err(err) => {
|
||||
// Getting local timelines can fail when no local tenant directory is on disk (e.g, when tenant data is being downloaded).
|
||||
// In that case, put a warning message into log and operate normally.
|
||||
warn!("Failed to get local timelines for tenant {tenant_id}: {err}");
|
||||
None
|
||||
}
|
||||
Ok(local_timeline_infos) => Some(
|
||||
local_timeline_infos
|
||||
.into_iter()
|
||||
.fold(0, |acc, x| acc + x.1.current_physical_size.unwrap()),
|
||||
),
|
||||
};
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
TenantInfo {
|
||||
@@ -784,7 +747,7 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
#[cfg(any(feature = "testing", feature = "failpoints"))]
|
||||
#[cfg(feature = "testing")]
|
||||
async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
if !fail::has_failpoints() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
|
||||
@@ -119,32 +119,6 @@ impl<T> TenantTimelineValues<T> {
|
||||
fn new() -> Self {
|
||||
Self(HashMap::new())
|
||||
}
|
||||
|
||||
fn with_capacity(capacity: usize) -> Self {
|
||||
Self(HashMap::with_capacity(capacity))
|
||||
}
|
||||
|
||||
/// A convenience method to map certain values and omit some of them, if needed.
|
||||
/// Tenants that won't have any timeline entries due to the filtering, will still be preserved
|
||||
/// in the structure.
|
||||
fn filter_map<F, NewT>(self, map: F) -> TenantTimelineValues<NewT>
|
||||
where
|
||||
F: Fn(T) -> Option<NewT>,
|
||||
{
|
||||
let capacity = self.0.len();
|
||||
self.0.into_iter().fold(
|
||||
TenantTimelineValues::<NewT>::with_capacity(capacity),
|
||||
|mut new_values, (tenant_id, old_values)| {
|
||||
let new_timeline_values = new_values.0.entry(tenant_id).or_default();
|
||||
for (timeline_id, old_value) in old_values {
|
||||
if let Some(new_value) = map(old_value) {
|
||||
new_timeline_values.insert(timeline_id, new_value);
|
||||
}
|
||||
}
|
||||
new_values
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// A suffix to be used during file sync from the remote storage,
|
||||
@@ -181,35 +155,3 @@ mod backoff_defaults_tests {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::tenant::harness::TIMELINE_ID;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn tenant_timeline_value_mapping() {
|
||||
let first_tenant = TenantId::generate();
|
||||
let second_tenant = TenantId::generate();
|
||||
assert_ne!(first_tenant, second_tenant);
|
||||
|
||||
let mut initial = TenantTimelineValues::new();
|
||||
initial
|
||||
.0
|
||||
.entry(first_tenant)
|
||||
.or_default()
|
||||
.insert(TIMELINE_ID, "test_value");
|
||||
let _ = initial.0.entry(second_tenant).or_default();
|
||||
assert_eq!(initial.0.len(), 2, "Should have entries for both tenants");
|
||||
|
||||
let filtered = initial.filter_map(|_| None::<&str>).0;
|
||||
assert_eq!(
|
||||
filtered.len(),
|
||||
2,
|
||||
"Should have entries for both tenants even after filtering away all entries"
|
||||
);
|
||||
assert!(filtered.contains_key(&first_tenant));
|
||||
assert!(filtered.contains_key(&second_tenant));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -107,18 +107,20 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
|
||||
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
|
||||
// or in testing they estimate how much we would upload if we did.
|
||||
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_created_persistent_files_total",
|
||||
"Number of files created that are meant to be uploaded to cloud storage",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_written_persistent_bytes_total",
|
||||
"Total bytes written that are meant to be uploaded to cloud storage",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -275,11 +277,15 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
/// smallest redo processing times. These buckets allow us to measure down
|
||||
/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec.
|
||||
/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec.
|
||||
///
|
||||
/// Values up to 1s are recorded because metrics show that we have redo
|
||||
/// durations and lock times larger than 0.250s.
|
||||
macro_rules! redo_histogram_time_buckets {
|
||||
() => {
|
||||
vec![
|
||||
0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
|
||||
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000,
|
||||
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, 0.500_000,
|
||||
1.000_000,
|
||||
]
|
||||
};
|
||||
}
|
||||
@@ -294,6 +300,17 @@ macro_rules! redo_histogram_count_buckets {
|
||||
};
|
||||
}
|
||||
|
||||
/// Bucket boundaries used for the byte-size WAL redo histogram.
macro_rules! redo_bytes_histogram_count_buckets {
    () => {
        // Boundaries follow successive powers of sqrt(2), starting at 2^4.5 and ending
        // at 2^15, which yields 22 buckets in total.
        // Every boundary is rounded up to the next multiple of 8, so that a MAXALIGNed
        // record of that size is captured by the same bucket as well.
        vec![
            24.0, 32.0, // 2^4.5, 2^5
            48.0, 64.0, // 2^5.5, 2^6
            96.0, 128.0, // 2^6.5, 2^7
            184.0, 256.0, // 2^7.5, 2^8
            368.0, 512.0, // 2^8.5, 2^9
            728.0, 1024.0, // 2^9.5, 2^10
            1456.0, 2048.0, // 2^10.5, 2^11
            2904.0, 4096.0, // 2^11.5, 2^12
            5800.0, 8192.0, // 2^12.5, 2^13
            11592.0, 16384.0, // 2^13.5, 2^14
            23176.0, 32768.0, // 2^14.5, 2^15
        ]
    };
}
|
||||
|
||||
pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_seconds",
|
||||
@@ -321,6 +338,15 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_bytes_histogram",
|
||||
"Histogram of number of records replayed per redo",
|
||||
redo_bytes_histogram_count_buckets!(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_replayed_wal_records_total",
|
||||
@@ -386,8 +412,12 @@ impl TimelineMetrics {
|
||||
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED.clone();
|
||||
let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN.clone();
|
||||
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
@@ -419,6 +449,8 @@ impl Drop for TimelineMetrics {
|
||||
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
|
||||
for op in STORAGE_TIME_OPERATIONS {
|
||||
let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
|
||||
@@ -36,8 +36,9 @@
|
||||
//! mapping is automatically removed and the slot is marked free.
|
||||
//!
|
||||
|
||||
use dashmap::mapref::entry::Entry;
|
||||
use dashmap::DashMap;
|
||||
use std::{
|
||||
collections::{hash_map::Entry, HashMap},
|
||||
convert::TryInto,
|
||||
sync::{
|
||||
atomic::{AtomicU8, AtomicUsize, Ordering},
|
||||
@@ -168,18 +169,11 @@ impl Slot {
|
||||
pub struct PageCache {
|
||||
/// This contains the mapping from the cache key to buffer slot that currently
|
||||
/// contains the page, if any.
|
||||
///
|
||||
/// TODO: This is protected by a single lock. If that becomes a bottleneck,
|
||||
/// this HashMap can be replaced with a more concurrent version, there are
|
||||
/// plenty of such crates around.
|
||||
///
|
||||
/// If you add support for caching different kinds of objects, each object kind
|
||||
/// can have a separate mapping map, next to this field.
|
||||
materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
|
||||
materialized_page_map: DashMap<MaterializedPageHashKey, Vec<Version>>,
|
||||
|
||||
ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
|
||||
ephemeral_page_map: DashMap<(u64, u32), usize>,
|
||||
|
||||
immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,
|
||||
immutable_page_map: DashMap<(u64, u32), usize>,
|
||||
|
||||
/// The actual buffers with their metadata.
|
||||
slots: Box<[Slot]>,
|
||||
@@ -616,7 +610,7 @@ impl PageCache {
|
||||
fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
|
||||
match cache_key {
|
||||
CacheKey::MaterializedPage { hash_key, lsn } => {
|
||||
let map = self.materialized_page_map.read().unwrap();
|
||||
let map = &self.materialized_page_map;
|
||||
let versions = map.get(hash_key)?;
|
||||
|
||||
let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
|
||||
@@ -629,11 +623,11 @@ impl PageCache {
|
||||
Some(version.slot_idx)
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let map = self.ephemeral_page_map.read().unwrap();
|
||||
let map = &self.ephemeral_page_map;
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let map = self.immutable_page_map.read().unwrap();
|
||||
let map = &self.immutable_page_map;
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
}
|
||||
@@ -646,7 +640,7 @@ impl PageCache {
|
||||
fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
|
||||
match key {
|
||||
CacheKey::MaterializedPage { hash_key, lsn } => {
|
||||
let map = self.materialized_page_map.read().unwrap();
|
||||
let map = &self.materialized_page_map;
|
||||
let versions = map.get(hash_key)?;
|
||||
|
||||
if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
|
||||
@@ -656,11 +650,11 @@ impl PageCache {
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let map = self.ephemeral_page_map.read().unwrap();
|
||||
let map = &self.ephemeral_page_map;
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let map = self.immutable_page_map.read().unwrap();
|
||||
let map = &self.immutable_page_map;
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
}
|
||||
@@ -675,7 +669,7 @@ impl PageCache {
|
||||
hash_key: old_hash_key,
|
||||
lsn: old_lsn,
|
||||
} => {
|
||||
let mut map = self.materialized_page_map.write().unwrap();
|
||||
let map = &self.materialized_page_map;
|
||||
if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
|
||||
let versions = old_entry.get_mut();
|
||||
|
||||
@@ -690,12 +684,12 @@ impl PageCache {
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let mut map = self.ephemeral_page_map.write().unwrap();
|
||||
let map = &self.ephemeral_page_map;
|
||||
map.remove(&(*file_id, *blkno))
|
||||
.expect("could not find old key in mapping");
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let mut map = self.immutable_page_map.write().unwrap();
|
||||
let map = &self.immutable_page_map;
|
||||
map.remove(&(*file_id, *blkno))
|
||||
.expect("could not find old key in mapping");
|
||||
}
|
||||
@@ -713,8 +707,8 @@ impl PageCache {
|
||||
hash_key: new_key,
|
||||
lsn: new_lsn,
|
||||
} => {
|
||||
let mut map = self.materialized_page_map.write().unwrap();
|
||||
let versions = map.entry(new_key.clone()).or_default();
|
||||
let map = &self.materialized_page_map;
|
||||
let mut versions = map.entry(new_key.clone()).or_default();
|
||||
match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
|
||||
Ok(version_idx) => Some(versions[version_idx].slot_idx),
|
||||
Err(version_idx) => {
|
||||
@@ -730,7 +724,7 @@ impl PageCache {
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let mut map = self.ephemeral_page_map.write().unwrap();
|
||||
let map = &self.ephemeral_page_map;
|
||||
match map.entry((*file_id, *blkno)) {
|
||||
Entry::Occupied(entry) => Some(*entry.get()),
|
||||
Entry::Vacant(entry) => {
|
||||
@@ -740,7 +734,7 @@ impl PageCache {
|
||||
}
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let mut map = self.immutable_page_map.write().unwrap();
|
||||
let map = &self.immutable_page_map;
|
||||
match map.entry((*file_id, *blkno)) {
|
||||
Entry::Occupied(entry) => Some(*entry.get()),
|
||||
Entry::Vacant(entry) => {
|
||||
|
||||
@@ -169,9 +169,14 @@ use self::{
|
||||
upload::{upload_index_part, upload_timeline_layers, UploadedTimeline},
|
||||
};
|
||||
use crate::{
|
||||
config::PageServerConf, exponential_backoff, storage_sync::index::RemoteIndex, task_mgr,
|
||||
task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, tenant::metadata::TimelineMetadata,
|
||||
tenant_mgr::attach_local_tenants,
|
||||
config::PageServerConf,
|
||||
exponential_backoff,
|
||||
storage_sync::index::{LayerFileMetadata, RemoteIndex},
|
||||
task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::BACKGROUND_RUNTIME,
|
||||
tenant::metadata::TimelineMetadata,
|
||||
tenant_mgr::{attach_local_tenants, TenantAttachData},
|
||||
};
|
||||
use crate::{
|
||||
metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD},
|
||||
@@ -188,7 +193,7 @@ static SYNC_QUEUE: OnceCell<SyncQueue> = OnceCell::new();
|
||||
|
||||
/// A timeline status to share with pageserver's sync counterpart,
|
||||
/// after comparing local and remote timeline state.
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
pub enum LocalTimelineInitStatus {
|
||||
/// The timeline has every remote layer present locally.
|
||||
/// There could be some layers requiring uploading,
|
||||
@@ -311,7 +316,7 @@ impl SyncQueue {
|
||||
|
||||
/// A task to run in the async download/upload loop.
|
||||
/// Limited by the number of retries: after a certain threshold, the failing task gets evicted and the timeline disabled.
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
enum SyncTask {
|
||||
/// A checkpoint outcome with possible local file updates that need actualization in the remote storage.
|
||||
/// Not necessarily fresher than the one already uploaded.
|
||||
@@ -422,7 +427,7 @@ impl SyncTaskBatch {
|
||||
.extend(new_delete.data.deleted_layers.iter().cloned());
|
||||
}
|
||||
if let Some(batch_upload) = &mut self.upload {
|
||||
let not_deleted = |layer: &PathBuf| {
|
||||
let not_deleted = |layer: &PathBuf, _: &mut LayerFileMetadata| {
|
||||
!new_delete.data.layers_to_delete.contains(layer)
|
||||
&& !new_delete.data.deleted_layers.contains(layer)
|
||||
};
|
||||
@@ -450,21 +455,35 @@ impl SyncTaskBatch {
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
struct LayersUpload {
|
||||
/// Layer file paths in the pageserver workdir that were added for the corresponding checkpoint.
|
||||
layers_to_upload: HashSet<PathBuf>,
|
||||
layers_to_upload: HashMap<PathBuf, LayerFileMetadata>,
|
||||
/// Already uploaded layers. Used to store the data about the uploads between task retries
|
||||
/// and to record the data into the remote index after the task got completed or evicted.
|
||||
uploaded_layers: HashSet<PathBuf>,
|
||||
uploaded_layers: HashMap<PathBuf, LayerFileMetadata>,
|
||||
metadata: Option<TimelineMetadata>,
|
||||
}
|
||||
|
||||
/// A timeline download task.
|
||||
/// Does not contain the file list to download, to allow other
|
||||
/// parts of the pageserver code to schedule the task
|
||||
/// without using the remote index or any other ways to list the remote timleine files.
|
||||
/// without using the remote index or any other ways to list the remote timeline files.
|
||||
/// Skips the files that are already downloaded.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
struct LayersDownload {
|
||||
layers_to_skip: HashSet<PathBuf>,
|
||||
|
||||
/// Paths which have been downloaded, and had their metadata verified or generated.
|
||||
///
|
||||
/// Metadata generation happens when upgrading from past version of `IndexPart`.
|
||||
gathered_metadata: HashMap<PathBuf, LayerFileMetadata>,
|
||||
}
|
||||
|
||||
impl LayersDownload {
|
||||
fn from_skipped_layers(layers_to_skip: HashSet<PathBuf>) -> Self {
|
||||
LayersDownload {
|
||||
layers_to_skip,
|
||||
gathered_metadata: HashMap::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
@@ -486,7 +505,7 @@ struct LayersDeletion {
|
||||
pub fn schedule_layer_upload(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
layers_to_upload: HashSet<PathBuf>,
|
||||
layers_to_upload: HashMap<PathBuf, LayerFileMetadata>,
|
||||
metadata: Option<TimelineMetadata>,
|
||||
) {
|
||||
let sync_queue = match SYNC_QUEUE.get() {
|
||||
@@ -503,7 +522,7 @@ pub fn schedule_layer_upload(
|
||||
},
|
||||
SyncTask::upload(LayersUpload {
|
||||
layers_to_upload,
|
||||
uploaded_layers: HashSet::new(),
|
||||
uploaded_layers: HashMap::new(),
|
||||
metadata,
|
||||
}),
|
||||
);
|
||||
@@ -561,18 +580,44 @@ pub fn schedule_layer_download(tenant_id: TenantId, timeline_id: TimelineId) {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
SyncTask::download(LayersDownload {
|
||||
layers_to_skip: HashSet::new(),
|
||||
}),
|
||||
SyncTask::download(LayersDownload::from_skipped_layers(HashSet::new())),
|
||||
);
|
||||
debug!("Download task for tenant {tenant_id}, timeline {timeline_id} sent")
|
||||
}
|
||||
|
||||
/// Local existing timeline files
|
||||
///
|
||||
/// Values of this type serve different meanings in different contexts. On startup, collected
|
||||
/// timelines come with the full collected file information. When signalling readiness to attach
|
||||
/// after a completed download, the file information is no longer carried, because
|
||||
/// it is already merged into [`RemoteTimeline`].
|
||||
#[derive(Debug)]
|
||||
pub struct TimelineLocalFiles(TimelineMetadata, HashMap<PathBuf, LayerFileMetadata>);
|
||||
|
||||
impl TimelineLocalFiles {
|
||||
pub fn metadata(&self) -> &TimelineMetadata {
|
||||
&self.0
|
||||
}
|
||||
|
||||
/// Called during startup, for all of the local files with full metadata.
|
||||
pub(crate) fn collected(
|
||||
metadata: TimelineMetadata,
|
||||
timeline_files: HashMap<PathBuf, LayerFileMetadata>,
|
||||
) -> TimelineLocalFiles {
|
||||
TimelineLocalFiles(metadata, timeline_files)
|
||||
}
|
||||
|
||||
/// Called near the end of tenant initialization, to signal readiness to attach tenants.
|
||||
pub(crate) fn ready(metadata: TimelineMetadata) -> Self {
|
||||
TimelineLocalFiles(metadata, HashMap::new())
|
||||
}
|
||||
}
|
||||
|
||||
/// Launch a thread to perform remote storage sync tasks.
|
||||
/// See module docs for loop step description.
|
||||
pub fn spawn_storage_sync_task(
|
||||
conf: &'static PageServerConf,
|
||||
local_timeline_files: TenantTimelineValues<(TimelineMetadata, HashSet<PathBuf>)>,
|
||||
local_timeline_files: HashMap<TenantId, HashMap<TimelineId, TimelineLocalFiles>>,
|
||||
storage: GenericRemoteStorage,
|
||||
max_concurrent_timelines_sync: NonZeroUsize,
|
||||
max_sync_errors: NonZeroU32,
|
||||
@@ -595,7 +640,7 @@ pub fn spawn_storage_sync_task(
|
||||
let mut keys_for_index_part_downloads = HashSet::new();
|
||||
let mut timelines_to_sync = HashMap::new();
|
||||
|
||||
for (tenant_id, timeline_data) in local_timeline_files.0 {
|
||||
for (tenant_id, timeline_data) in local_timeline_files {
|
||||
if timeline_data.is_empty() {
|
||||
info!("got empty tenant {}", tenant_id);
|
||||
let _ = empty_tenants.0.entry(tenant_id).or_default();
|
||||
@@ -698,7 +743,7 @@ async fn storage_sync_loop(
|
||||
"Sync loop step completed, {} new tenant state update(s)",
|
||||
updated_tenants.len()
|
||||
);
|
||||
let mut timelines_to_attach = TenantTimelineValues::new();
|
||||
let mut timelines_to_attach = HashMap::new();
|
||||
let index_accessor = index.read().await;
|
||||
for tenant_id in updated_tenants {
|
||||
let tenant_entry = match index_accessor.tenant_entry(&tenant_id) {
|
||||
@@ -724,12 +769,16 @@ async fn storage_sync_loop(
|
||||
// and register them all at once in a tenant for download
|
||||
// to be submitted in a single operation to tenant
|
||||
// so it can apply them at once to internal timeline map.
|
||||
timelines_to_attach.0.insert(
|
||||
timelines_to_attach.insert(
|
||||
tenant_id,
|
||||
tenant_entry
|
||||
.iter()
|
||||
.map(|(&id, entry)| (id, entry.metadata.clone()))
|
||||
.collect(),
|
||||
TenantAttachData::Ready(
|
||||
tenant_entry
|
||||
.iter()
|
||||
.map(|(&id, entry)| {
|
||||
(id, TimelineLocalFiles::ready(entry.metadata.clone()))
|
||||
})
|
||||
.collect(),
|
||||
),
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -971,15 +1020,27 @@ async fn download_timeline_data(
|
||||
}
|
||||
DownloadedTimeline::Successful(mut download_data) => {
|
||||
match update_local_metadata(conf, sync_id, current_remote_timeline).await {
|
||||
Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
|
||||
Ok(()) => {
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
|
||||
return DownloadStatus::Downloaded;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}");
|
||||
}
|
||||
},
|
||||
Ok(()) => {
|
||||
let mut g = index.write().await;
|
||||
|
||||
match g.set_awaits_download(&sync_id, false) {
|
||||
Ok(()) => {
|
||||
let timeline = g
|
||||
.timeline_entry_mut(&sync_id)
|
||||
.expect("set_awaits_download verified existence");
|
||||
|
||||
timeline.merge_metadata_from_downloaded(
|
||||
&download_data.data.gathered_metadata,
|
||||
);
|
||||
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
|
||||
return DownloadStatus::Downloaded;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}");
|
||||
}
|
||||
};
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to update local timeline metadata: {e:?}");
|
||||
download_data.retries += 1;
|
||||
@@ -1182,11 +1243,18 @@ async fn update_remote_data(
|
||||
}
|
||||
if upload_failed {
|
||||
existing_entry.add_upload_failures(
|
||||
uploaded_data.layers_to_upload.iter().cloned(),
|
||||
uploaded_data
|
||||
.layers_to_upload
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_owned(), v.to_owned())),
|
||||
);
|
||||
} else {
|
||||
existing_entry
|
||||
.add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned());
|
||||
existing_entry.add_timeline_layers(
|
||||
uploaded_data
|
||||
.uploaded_layers
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_owned(), v.to_owned())),
|
||||
);
|
||||
}
|
||||
}
|
||||
RemoteDataUpdate::Delete(layers_to_remove) => {
|
||||
@@ -1206,11 +1274,19 @@ async fn update_remote_data(
|
||||
};
|
||||
let mut new_remote_timeline = RemoteTimeline::new(new_metadata.clone());
|
||||
if upload_failed {
|
||||
new_remote_timeline
|
||||
.add_upload_failures(uploaded_data.layers_to_upload.iter().cloned());
|
||||
new_remote_timeline.add_upload_failures(
|
||||
uploaded_data
|
||||
.layers_to_upload
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_owned(), v.to_owned())),
|
||||
);
|
||||
} else {
|
||||
new_remote_timeline
|
||||
.add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned());
|
||||
new_remote_timeline.add_timeline_layers(
|
||||
uploaded_data
|
||||
.uploaded_layers
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_owned(), v.to_owned())),
|
||||
);
|
||||
}
|
||||
|
||||
index_accessor.add_timeline_entry(sync_id, new_remote_timeline.clone());
|
||||
@@ -1258,13 +1334,14 @@ async fn validate_task_retries(
|
||||
fn schedule_first_sync_tasks(
|
||||
index: &mut RemoteTimelineIndex,
|
||||
sync_queue: &SyncQueue,
|
||||
local_timeline_files: HashMap<TenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
|
||||
local_timeline_files: HashMap<TenantTimelineId, TimelineLocalFiles>,
|
||||
) -> TenantTimelineValues<LocalTimelineInitStatus> {
|
||||
let mut local_timeline_init_statuses = TenantTimelineValues::new();
|
||||
|
||||
let mut new_sync_tasks = VecDeque::with_capacity(local_timeline_files.len());
|
||||
|
||||
for (sync_id, (local_metadata, local_files)) in local_timeline_files {
|
||||
for (sync_id, local_timeline) in local_timeline_files {
|
||||
let TimelineLocalFiles(local_metadata, local_files) = local_timeline;
|
||||
match index.timeline_entry_mut(&sync_id) {
|
||||
Some(remote_timeline) => {
|
||||
let (timeline_status, awaits_download) = compare_local_and_remote_timeline(
|
||||
@@ -1308,7 +1385,7 @@ fn schedule_first_sync_tasks(
|
||||
sync_id,
|
||||
SyncTask::upload(LayersUpload {
|
||||
layers_to_upload: local_files,
|
||||
uploaded_layers: HashSet::new(),
|
||||
uploaded_layers: HashMap::new(),
|
||||
metadata: Some(local_metadata.clone()),
|
||||
}),
|
||||
));
|
||||
@@ -1335,20 +1412,46 @@ fn compare_local_and_remote_timeline(
|
||||
new_sync_tasks: &mut VecDeque<(TenantTimelineId, SyncTask)>,
|
||||
sync_id: TenantTimelineId,
|
||||
local_metadata: TimelineMetadata,
|
||||
local_files: HashSet<PathBuf>,
|
||||
local_files: HashMap<PathBuf, LayerFileMetadata>,
|
||||
remote_entry: &RemoteTimeline,
|
||||
) -> (LocalTimelineInitStatus, bool) {
|
||||
let _entered = info_span!("compare_local_and_remote_timeline", sync_id = %sync_id).entered();
|
||||
|
||||
let remote_files = remote_entry.stored_files();
|
||||
let needed_to_download_files = remote_entry
|
||||
.stored_files()
|
||||
.iter()
|
||||
.filter_map(|(layer_file, remote_metadata)| {
|
||||
if let Some(local_metadata) = local_files.get(layer_file) {
|
||||
match (remote_metadata.file_size(), local_metadata.file_size()) {
|
||||
(Some(x), Some(y)) if x == y => { None },
|
||||
(None, Some(_)) => {
|
||||
// upgrading from an earlier IndexPart without metadata
|
||||
None
|
||||
},
|
||||
_ => {
|
||||
// having to deal with other than (Some(x), Some(y)) where x != y here is a
|
||||
// bummer, but see #2582 and #2610 for attempts and discussion.
|
||||
warn!("Redownloading locally existing {layer_file:?} due to size mismatch, size on index: {:?}, on disk: {:?}", remote_metadata.file_size(), local_metadata.file_size());
|
||||
Some(layer_file)
|
||||
},
|
||||
}
|
||||
} else {
|
||||
// doesn't exist locally
|
||||
Some(layer_file)
|
||||
}
|
||||
})
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
let number_of_layers_to_download = remote_files.difference(&local_files).count();
|
||||
let (initial_timeline_status, awaits_download) = if number_of_layers_to_download > 0 {
|
||||
let (initial_timeline_status, awaits_download) = if !needed_to_download_files.is_empty() {
|
||||
new_sync_tasks.push_back((
|
||||
sync_id,
|
||||
SyncTask::download(LayersDownload {
|
||||
layers_to_skip: local_files.clone(),
|
||||
}),
|
||||
SyncTask::download(LayersDownload::from_skipped_layers(
|
||||
local_files
|
||||
.keys()
|
||||
.filter(|path| !needed_to_download_files.contains(path))
|
||||
.cloned()
|
||||
.collect(),
|
||||
)),
|
||||
));
|
||||
info!("NeedsSync");
|
||||
(LocalTimelineInitStatus::NeedsSync, true)
|
||||
@@ -1363,15 +1466,22 @@ fn compare_local_and_remote_timeline(
|
||||
};
|
||||
|
||||
let layers_to_upload = local_files
|
||||
.difference(remote_files)
|
||||
.cloned()
|
||||
.collect::<HashSet<_>>();
|
||||
.iter()
|
||||
.filter_map(|(local_file, metadata)| {
|
||||
if !remote_entry.stored_files().contains_key(local_file) {
|
||||
Some((local_file.to_owned(), metadata.to_owned()))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
if !layers_to_upload.is_empty() {
|
||||
new_sync_tasks.push_back((
|
||||
sync_id,
|
||||
SyncTask::upload(LayersUpload {
|
||||
layers_to_upload,
|
||||
uploaded_layers: HashSet::new(),
|
||||
uploaded_layers: HashMap::new(),
|
||||
metadata: Some(local_metadata),
|
||||
}),
|
||||
));
|
||||
@@ -1427,11 +1537,12 @@ mod test_utils {
|
||||
let timeline_path = harness.timeline_path(&timeline_id);
|
||||
fs::create_dir_all(&timeline_path).await?;
|
||||
|
||||
let mut layers_to_upload = HashSet::with_capacity(filenames.len());
|
||||
let mut layers_to_upload = HashMap::with_capacity(filenames.len());
|
||||
for &file in filenames {
|
||||
let file_path = timeline_path.join(file);
|
||||
fs::write(&file_path, dummy_contents(file).into_bytes()).await?;
|
||||
layers_to_upload.insert(file_path);
|
||||
let metadata = LayerFileMetadata::new(file_path.metadata()?.len());
|
||||
layers_to_upload.insert(file_path, metadata);
|
||||
}
|
||||
|
||||
fs::write(
|
||||
@@ -1442,7 +1553,7 @@ mod test_utils {
|
||||
|
||||
Ok(LayersUpload {
|
||||
layers_to_upload,
|
||||
uploaded_layers: HashSet::new(),
|
||||
uploaded_layers: HashMap::new(),
|
||||
metadata: Some(metadata),
|
||||
})
|
||||
}
|
||||
@@ -1497,12 +1608,13 @@ mod tests {
|
||||
assert!(sync_id_2 != sync_id_3);
|
||||
assert!(sync_id_3 != TEST_SYNC_ID);
|
||||
|
||||
let download_task = SyncTask::download(LayersDownload {
|
||||
layers_to_skip: HashSet::from([PathBuf::from("sk")]),
|
||||
});
|
||||
let download_task =
|
||||
SyncTask::download(LayersDownload::from_skipped_layers(HashSet::from([
|
||||
PathBuf::from("sk"),
|
||||
])));
|
||||
let upload_task = SyncTask::upload(LayersUpload {
|
||||
layers_to_upload: HashSet::from([PathBuf::from("up")]),
|
||||
uploaded_layers: HashSet::from([PathBuf::from("upl")]),
|
||||
layers_to_upload: HashMap::from([(PathBuf::from("up"), LayerFileMetadata::new(123))]),
|
||||
uploaded_layers: HashMap::from([(PathBuf::from("upl"), LayerFileMetadata::new(123))]),
|
||||
metadata: Some(dummy_metadata(Lsn(2))),
|
||||
});
|
||||
let delete_task = SyncTask::delete(LayersDeletion {
|
||||
@@ -1546,12 +1658,10 @@ mod tests {
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
assert_eq!(sync_queue.len(), 0);
|
||||
|
||||
let download = LayersDownload {
|
||||
layers_to_skip: HashSet::from([PathBuf::from("sk")]),
|
||||
};
|
||||
let download = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk")]));
|
||||
let upload = LayersUpload {
|
||||
layers_to_upload: HashSet::from([PathBuf::from("up")]),
|
||||
uploaded_layers: HashSet::from([PathBuf::from("upl")]),
|
||||
layers_to_upload: HashMap::from([(PathBuf::from("up"), LayerFileMetadata::new(123))]),
|
||||
uploaded_layers: HashMap::from([(PathBuf::from("upl"), LayerFileMetadata::new(123))]),
|
||||
metadata: Some(dummy_metadata(Lsn(2))),
|
||||
};
|
||||
let delete = LayersDeletion {
|
||||
@@ -1599,18 +1709,10 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn same_task_id_same_tasks_batch() {
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(1).unwrap());
|
||||
let download_1 = LayersDownload {
|
||||
layers_to_skip: HashSet::from([PathBuf::from("sk1")]),
|
||||
};
|
||||
let download_2 = LayersDownload {
|
||||
layers_to_skip: HashSet::from([PathBuf::from("sk2")]),
|
||||
};
|
||||
let download_3 = LayersDownload {
|
||||
layers_to_skip: HashSet::from([PathBuf::from("sk3")]),
|
||||
};
|
||||
let download_4 = LayersDownload {
|
||||
layers_to_skip: HashSet::from([PathBuf::from("sk4")]),
|
||||
};
|
||||
let download_1 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk1")]));
|
||||
let download_2 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk2")]));
|
||||
let download_3 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk3")]));
|
||||
let download_4 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk4")]));
|
||||
|
||||
let sync_id_2 = TenantTimelineId {
|
||||
tenant_id: TenantId::from_array(hex!("22223344556677881122334455667788")),
|
||||
@@ -1634,15 +1736,15 @@ mod tests {
|
||||
Some(SyncTaskBatch {
|
||||
download: Some(SyncData {
|
||||
retries: 0,
|
||||
data: LayersDownload {
|
||||
layers_to_skip: {
|
||||
data: LayersDownload::from_skipped_layers(
|
||||
{
|
||||
let mut set = HashSet::new();
|
||||
set.extend(download_1.layers_to_skip.into_iter());
|
||||
set.extend(download_2.layers_to_skip.into_iter());
|
||||
set.extend(download_4.layers_to_skip.into_iter());
|
||||
set
|
||||
},
|
||||
}
|
||||
)
|
||||
}),
|
||||
upload: None,
|
||||
delete: None,
|
||||
@@ -1658,4 +1760,148 @@ mod tests {
|
||||
"Should have one task left out of the batch"
|
||||
);
|
||||
}
|
||||
|
||||
mod local_and_remote_comparisons {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn ready() {
|
||||
let mut new_sync_tasks = VecDeque::default();
|
||||
let sync_id = TenantTimelineId::generate();
|
||||
let local_metadata = dummy_metadata(0x02.into());
|
||||
let local_files =
|
||||
HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
|
||||
let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
|
||||
remote_entry
|
||||
.add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
|
||||
|
||||
let (status, sync_needed) = compare_local_and_remote_timeline(
|
||||
&mut new_sync_tasks,
|
||||
sync_id,
|
||||
local_metadata.clone(),
|
||||
local_files,
|
||||
&remote_entry,
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
status,
|
||||
LocalTimelineInitStatus::LocallyComplete(local_metadata)
|
||||
);
|
||||
assert!(!sync_needed);
|
||||
|
||||
assert!(new_sync_tasks.is_empty(), "{:?}", new_sync_tasks);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn needs_download() {
|
||||
let mut new_sync_tasks = VecDeque::default();
|
||||
let sync_id = TenantTimelineId::generate();
|
||||
let local_metadata = dummy_metadata(0x02.into());
|
||||
let local_files = HashMap::default();
|
||||
let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
|
||||
remote_entry
|
||||
.add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
|
||||
|
||||
let (status, sync_needed) = compare_local_and_remote_timeline(
|
||||
&mut new_sync_tasks,
|
||||
sync_id,
|
||||
local_metadata,
|
||||
local_files.clone(),
|
||||
&remote_entry,
|
||||
);
|
||||
|
||||
assert_eq!(status, LocalTimelineInitStatus::NeedsSync);
|
||||
assert!(sync_needed);
|
||||
|
||||
let new_sync_tasks = new_sync_tasks.into_iter().collect::<Vec<_>>();
|
||||
|
||||
assert_eq!(
|
||||
&new_sync_tasks,
|
||||
&[(
|
||||
sync_id,
|
||||
SyncTask::download(LayersDownload::from_skipped_layers(
|
||||
local_files.keys().cloned().collect()
|
||||
))
|
||||
)]
|
||||
);
|
||||
}
|
||||
|
||||
/// Regression test for upgrading from a remote index without per-layer metadata.
///
/// Originally the implementation missed the `(None, Some(_))` case in the match, and
/// proceeded to always redownload if the remote metadata was not available.
#[test]
fn redownload_is_not_needed_on_upgrade() {
    let layer_path = PathBuf::from("first_file");
    let sync_id = TenantTimelineId::generate();
    let local_metadata = dummy_metadata(0x02.into());

    // The type system would in general allow a LayerFileMetadata to be created with
    // file_size: None, however `LayerFileMetadata::default` is only allowed from tests,
    // so everywhere within the system a valid LayerFileMetadata is created through `::new`.
    let local_files = HashMap::from([(layer_path.clone(), LayerFileMetadata::new(123))]);

    // The RemoteTimeline is constructed as if out of an older version of IndexPart, which
    // didn't carry any per-layer metadata.
    let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
    remote_entry.add_timeline_layers([(layer_path, LayerFileMetadata::default())]);

    let mut new_sync_tasks = VecDeque::default();
    let (status, sync_needed) = compare_local_and_remote_timeline(
        &mut new_sync_tasks,
        sync_id,
        local_metadata.clone(),
        local_files,
        &remote_entry,
    );

    assert_eq!(
        status,
        LocalTimelineInitStatus::LocallyComplete(local_metadata)
    );
    assert!(!sync_needed);
}
|
||||
|
||||
/// A layer that exists locally but is not listed remotely keeps the timeline
/// `LocallyComplete` without requiring a sync, while a single upload task carrying
/// the local layers and the metadata is enqueued.
#[test]
fn needs_upload() {
    let sync_id = TenantTimelineId::generate();
    let local_metadata = dummy_metadata(0x02.into());
    let local_files =
        HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);

    // The remote timeline exists, but no layers are registered for it.
    let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
    remote_entry.add_timeline_layers([]);

    let mut new_sync_tasks = VecDeque::default();
    let (status, sync_needed) = compare_local_and_remote_timeline(
        &mut new_sync_tasks,
        sync_id,
        local_metadata.clone(),
        local_files.clone(),
        &remote_entry,
    );

    assert_eq!(
        status,
        LocalTimelineInitStatus::LocallyComplete(local_metadata.clone())
    );
    assert!(!sync_needed);

    let expected_upload = SyncTask::upload(LayersUpload {
        layers_to_upload: local_files,
        uploaded_layers: HashMap::default(),
        metadata: Some(local_metadata),
    });
    let scheduled = new_sync_tasks.into_iter().collect::<Vec<_>>();

    assert_eq!(&scheduled, &[(sync_id, expected_upload)]);
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -171,7 +171,7 @@ mod tests {
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let timeline_upload =
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
|
||||
for local_path in timeline_upload.layers_to_upload {
|
||||
for (local_path, _metadata) in timeline_upload.layers_to_upload {
|
||||
let remote_path =
|
||||
local_storage.resolve_in_storage(&local_storage.remote_object_id(&local_path)?)?;
|
||||
let remote_parent_dir = remote_path.parent().unwrap();
|
||||
|
||||
@@ -16,7 +16,11 @@ use tokio::{
|
||||
};
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
use crate::{config::PageServerConf, storage_sync::SyncTask, TEMP_FILE_SUFFIX};
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
storage_sync::{index::LayerFileMetadata, SyncTask},
|
||||
TEMP_FILE_SUFFIX,
|
||||
};
|
||||
use utils::{
|
||||
crashsafe_dir::path_with_suffix_extension,
|
||||
id::{TenantId, TenantTimelineId, TimelineId},
|
||||
@@ -219,8 +223,14 @@ pub(super) async fn download_timeline_layers<'a>(
|
||||
|
||||
let layers_to_download = remote_timeline
|
||||
.stored_files()
|
||||
.difference(&download.layers_to_skip)
|
||||
.cloned()
|
||||
.iter()
|
||||
.filter_map(|(layer_path, metadata)| {
|
||||
if !download.layers_to_skip.contains(layer_path) {
|
||||
Some((layer_path.to_owned(), metadata.to_owned()))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
debug!("Layers to download: {layers_to_download:?}");
|
||||
@@ -233,89 +243,129 @@ pub(super) async fn download_timeline_layers<'a>(
|
||||
|
||||
let mut download_tasks = layers_to_download
|
||||
.into_iter()
|
||||
.map(|layer_destination_path| async move {
|
||||
if layer_destination_path.exists() {
|
||||
debug!(
|
||||
"Layer already exists locally, skipping download: {}",
|
||||
layer_destination_path.display()
|
||||
);
|
||||
} else {
|
||||
// Perform a rename inspired by durable_rename from file_utils.c.
|
||||
// The sequence:
|
||||
// write(tmp)
|
||||
// fsync(tmp)
|
||||
// rename(tmp, new)
|
||||
// fsync(new)
|
||||
// fsync(parent)
|
||||
// For more context about durable_rename check this email from postgres mailing list:
|
||||
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
|
||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||
let temp_file_path =
|
||||
path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX);
|
||||
.map(|(layer_destination_path, metadata)| async move {
|
||||
|
||||
let mut destination_file =
|
||||
fs::File::create(&temp_file_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create a destination file for layer '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
match layer_destination_path.metadata() {
|
||||
Ok(m) if m.is_file() => {
|
||||
// the file exists from earlier round when we failed after renaming it as
|
||||
// layer_destination_path
|
||||
let verified = if let Some(expected) = metadata.file_size() {
|
||||
m.len() == expected
|
||||
} else {
|
||||
// behaviour before recording metadata was to accept any existing
|
||||
true
|
||||
};
|
||||
|
||||
let mut layer_download = storage.download_storage_object(None, &layer_destination_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to initiate the download the layer for {sync_id} into file '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
io::copy(&mut layer_download.download_stream, &mut destination_file)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download the layer for {sync_id} into file '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
||||
// you should call flush before dropping it.
|
||||
//
|
||||
// From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
|
||||
// we assume that `destination_file` file is fully written. I.e there is no pending .write(...).await operations.
|
||||
// But for additional safety let's check/wait for any pending operations.
|
||||
destination_file.flush().await.with_context(|| {
|
||||
format!(
|
||||
"failed to flush source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file.sync_all().await.with_context(|| {
|
||||
format!(
|
||||
"failed to fsync source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
drop(destination_file);
|
||||
|
||||
fail::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||
anyhow::bail!("remote-storage-download-pre-rename failpoint triggered")
|
||||
});
|
||||
|
||||
fs::rename(&temp_file_path, &layer_destination_path).await?;
|
||||
|
||||
fsync_path(&layer_destination_path).await.with_context(|| {
|
||||
format!(
|
||||
"Cannot fsync layer destination path {}",
|
||||
layer_destination_path.display(),
|
||||
)
|
||||
})?;
|
||||
if verified {
|
||||
debug!(
|
||||
"Layer already exists locally, skipping download: {}",
|
||||
layer_destination_path.display()
|
||||
);
|
||||
return Ok((layer_destination_path, LayerFileMetadata::new(m.len())))
|
||||
} else {
|
||||
// no need to remove it, it will be overwritten by fs::rename
|
||||
// after successful download
|
||||
warn!("Downloaded layer exists already but layer file metadata mismatches: {}, metadata {:?}", layer_destination_path.display(), metadata);
|
||||
}
|
||||
}
|
||||
Ok(m) => {
|
||||
return Err(anyhow::anyhow!("Downloaded layer destination exists but is not a file: {m:?}, target needs to be removed/archived manually: {layer_destination_path:?}"));
|
||||
}
|
||||
Err(_) => {
|
||||
// behave as the file didn't exist
|
||||
}
|
||||
}
|
||||
Ok::<_, anyhow::Error>(layer_destination_path)
|
||||
|
||||
// Perform a rename inspired by durable_rename from file_utils.c.
|
||||
// The sequence:
|
||||
// write(tmp)
|
||||
// fsync(tmp)
|
||||
// rename(tmp, new)
|
||||
// fsync(new)
|
||||
// fsync(parent)
|
||||
// For more context about durable_rename check this email from postgres mailing list:
|
||||
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
|
||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||
let temp_file_path =
|
||||
path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX);
|
||||
|
||||
// TODO: this doesn't use the cached fd for some reason?
|
||||
let mut destination_file =
|
||||
fs::File::create(&temp_file_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create a destination file for layer '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut layer_download = storage.download_storage_object(None, &layer_destination_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to initiate the download the layer for {sync_id} into file '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let bytes_amount = io::copy(&mut layer_download.download_stream, &mut destination_file)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download the layer for {sync_id} into file '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
||||
// you should call flush before dropping it.
|
||||
//
|
||||
// From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
|
||||
// we assume that `destination_file` file is fully written. I.e there is no pending .write(...).await operations.
|
||||
// But for additional safety let's check/wait for any pending operations.
|
||||
destination_file.flush().await.with_context(|| {
|
||||
format!(
|
||||
"failed to flush source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
match metadata.file_size() {
|
||||
Some(expected) if expected != bytes_amount => {
|
||||
anyhow::bail!(
|
||||
"According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
|
||||
temp_file_path.display()
|
||||
);
|
||||
},
|
||||
Some(_) | None => {
|
||||
// matches, or upgrading from an earlier IndexPart version
|
||||
}
|
||||
}
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file.sync_all().await.with_context(|| {
|
||||
format!(
|
||||
"failed to fsync source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
drop(destination_file);
|
||||
|
||||
fail::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||
anyhow::bail!("remote-storage-download-pre-rename failpoint triggered")
|
||||
});
|
||||
|
||||
fs::rename(&temp_file_path, &layer_destination_path).await?;
|
||||
|
||||
fsync_path(&layer_destination_path).await.with_context(|| {
|
||||
format!(
|
||||
"Cannot fsync layer destination path {}",
|
||||
layer_destination_path.display(),
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok::<_, anyhow::Error>((layer_destination_path, LayerFileMetadata::new(bytes_amount)))
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
@@ -324,9 +374,12 @@ pub(super) async fn download_timeline_layers<'a>(
|
||||
let mut undo = HashSet::new();
|
||||
while let Some(download_result) = download_tasks.next().await {
|
||||
match download_result {
|
||||
Ok(downloaded_path) => {
|
||||
Ok((downloaded_path, metadata)) => {
|
||||
undo.insert(downloaded_path.clone());
|
||||
download.layers_to_skip.insert(downloaded_path);
|
||||
download.layers_to_skip.insert(downloaded_path.clone());
|
||||
// what if the key existed already? ignore, because then we would had
|
||||
// downloaded a partial file, and had to retry
|
||||
download.gathered_metadata.insert(downloaded_path, metadata);
|
||||
}
|
||||
Err(e) => {
|
||||
errors_happened = true;
|
||||
@@ -349,6 +402,8 @@ pub(super) async fn download_timeline_layers<'a>(
|
||||
);
|
||||
for item in undo {
|
||||
download.layers_to_skip.remove(&item);
|
||||
// intentionally don't clear the gathered_metadata because it exists for fsync_path
|
||||
// failure on parent directory
|
||||
}
|
||||
errors_happened = true;
|
||||
}
|
||||
@@ -453,9 +508,9 @@ mod tests {
|
||||
let timeline_upload =
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
|
||||
|
||||
for local_path in timeline_upload.layers_to_upload {
|
||||
for local_path in timeline_upload.layers_to_upload.keys() {
|
||||
let remote_path =
|
||||
local_storage.resolve_in_storage(&storage.remote_object_id(&local_path)?)?;
|
||||
local_storage.resolve_in_storage(&storage.remote_object_id(local_path)?)?;
|
||||
let remote_parent_dir = remote_path.parent().unwrap();
|
||||
if !remote_parent_dir.exists() {
|
||||
fs::create_dir_all(&remote_parent_dir).await?;
|
||||
@@ -473,11 +528,19 @@ mod tests {
|
||||
|
||||
let mut remote_timeline = RemoteTimeline::new(metadata.clone());
|
||||
remote_timeline.awaits_download = true;
|
||||
remote_timeline.add_timeline_layers(
|
||||
layer_files
|
||||
.iter()
|
||||
.map(|layer| local_timeline_path.join(layer)),
|
||||
);
|
||||
remote_timeline.add_timeline_layers(layer_files.iter().map(|layer| {
|
||||
let layer_path = local_timeline_path.join(layer);
|
||||
|
||||
// this could had also been LayerFileMetadata::default(), but since in this test we
|
||||
// don't do the merge operation done by storage_sync::download_timeline_data, it would
|
||||
// not be merged back to timeline.
|
||||
let metadata_from_upload = timeline_upload
|
||||
.layers_to_upload
|
||||
.get(&layer_path)
|
||||
.expect("layer must exist in previously uploaded paths")
|
||||
.to_owned();
|
||||
(layer_path, metadata_from_upload)
|
||||
}));
|
||||
|
||||
let download_data = match download_timeline_layers(
|
||||
harness.conf,
|
||||
@@ -487,9 +550,9 @@ mod tests {
|
||||
sync_id,
|
||||
SyncData::new(
|
||||
current_retries,
|
||||
LayersDownload {
|
||||
layers_to_skip: HashSet::from([local_timeline_path.join("layer_to_skip")]),
|
||||
},
|
||||
LayersDownload::from_skipped_layers(HashSet::from([
|
||||
local_timeline_path.join("layer_to_skip")
|
||||
])),
|
||||
),
|
||||
)
|
||||
.await
|
||||
@@ -552,12 +615,7 @@ mod tests {
|
||||
&sync_queue,
|
||||
None,
|
||||
sync_id,
|
||||
SyncData::new(
|
||||
0,
|
||||
LayersDownload {
|
||||
layers_to_skip: HashSet::new(),
|
||||
},
|
||||
),
|
||||
SyncData::new(0, LayersDownload::from_skipped_layers(HashSet::new())),
|
||||
)
|
||||
.await;
|
||||
assert!(
|
||||
@@ -576,12 +634,7 @@ mod tests {
|
||||
&sync_queue,
|
||||
Some(¬_expecting_download_remote_timeline),
|
||||
sync_id,
|
||||
SyncData::new(
|
||||
0,
|
||||
LayersDownload {
|
||||
layers_to_skip: HashSet::new(),
|
||||
},
|
||||
),
|
||||
SyncData::new(0, LayersDownload::from_skipped_layers(HashSet::new())),
|
||||
)
|
||||
.await;
|
||||
assert!(
|
||||
|
||||
@@ -212,8 +212,8 @@ impl RemoteTimelineIndex {
|
||||
/// Restored index part data about the timeline, stored in the remote index.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RemoteTimeline {
|
||||
timeline_layers: HashSet<PathBuf>,
|
||||
missing_layers: HashSet<PathBuf>,
|
||||
timeline_layers: HashMap<PathBuf, LayerFileMetadata>,
|
||||
missing_layers: HashMap<PathBuf, LayerFileMetadata>,
|
||||
|
||||
pub metadata: TimelineMetadata,
|
||||
pub awaits_download: bool,
|
||||
@@ -222,62 +222,161 @@ pub struct RemoteTimeline {
|
||||
impl RemoteTimeline {
|
||||
pub fn new(metadata: TimelineMetadata) -> Self {
|
||||
Self {
|
||||
timeline_layers: HashSet::new(),
|
||||
missing_layers: HashSet::new(),
|
||||
timeline_layers: HashMap::default(),
|
||||
missing_layers: HashMap::default(),
|
||||
metadata,
|
||||
awaits_download: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_timeline_layers(&mut self, new_layers: impl IntoIterator<Item = PathBuf>) {
|
||||
self.timeline_layers.extend(new_layers.into_iter());
|
||||
pub fn add_timeline_layers(
|
||||
&mut self,
|
||||
new_layers: impl IntoIterator<Item = (PathBuf, LayerFileMetadata)>,
|
||||
) {
|
||||
self.timeline_layers.extend(new_layers);
|
||||
}
|
||||
|
||||
pub fn add_upload_failures(&mut self, upload_failures: impl IntoIterator<Item = PathBuf>) {
|
||||
self.missing_layers.extend(upload_failures.into_iter());
|
||||
pub fn add_upload_failures(
|
||||
&mut self,
|
||||
upload_failures: impl IntoIterator<Item = (PathBuf, LayerFileMetadata)>,
|
||||
) {
|
||||
self.missing_layers.extend(upload_failures);
|
||||
}
|
||||
|
||||
pub fn remove_layers(&mut self, layers_to_remove: &HashSet<PathBuf>) {
|
||||
self.timeline_layers
|
||||
.retain(|layer| !layers_to_remove.contains(layer));
|
||||
.retain(|layer, _| !layers_to_remove.contains(layer));
|
||||
self.missing_layers
|
||||
.retain(|layer| !layers_to_remove.contains(layer));
|
||||
.retain(|layer, _| !layers_to_remove.contains(layer));
|
||||
}
|
||||
|
||||
/// Lists all layer files in the given remote timeline. Omits the metadata file.
|
||||
pub fn stored_files(&self) -> &HashSet<PathBuf> {
|
||||
pub fn stored_files(&self) -> &HashMap<PathBuf, LayerFileMetadata> {
|
||||
&self.timeline_layers
|
||||
}
|
||||
|
||||
/// Combines metadata gathered or verified during downloading needed layer files to metadata on
|
||||
/// the [`RemoteIndex`], so it can be uploaded later.
|
||||
pub fn merge_metadata_from_downloaded(
|
||||
&mut self,
|
||||
downloaded: &HashMap<PathBuf, LayerFileMetadata>,
|
||||
) {
|
||||
downloaded.iter().for_each(|(path, metadata)| {
|
||||
if let Some(upgraded) = self.timeline_layers.get_mut(path) {
|
||||
upgraded.merge(metadata);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
pub fn from_index_part(timeline_path: &Path, index_part: IndexPart) -> anyhow::Result<Self> {
|
||||
let metadata = TimelineMetadata::from_bytes(&index_part.metadata_bytes)?;
|
||||
let default_metadata = &IndexLayerMetadata::default();
|
||||
|
||||
let find_metadata = |key: &RelativePath| -> LayerFileMetadata {
|
||||
index_part
|
||||
.layer_metadata
|
||||
.get(key)
|
||||
.unwrap_or(default_metadata)
|
||||
.into()
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
timeline_layers: to_local_paths(timeline_path, index_part.timeline_layers),
|
||||
missing_layers: to_local_paths(timeline_path, index_part.missing_layers),
|
||||
timeline_layers: index_part
|
||||
.timeline_layers
|
||||
.iter()
|
||||
.map(|layer_path| (layer_path.as_path(timeline_path), find_metadata(layer_path)))
|
||||
.collect(),
|
||||
missing_layers: index_part
|
||||
.missing_layers
|
||||
.iter()
|
||||
.map(|layer_path| (layer_path.as_path(timeline_path), find_metadata(layer_path)))
|
||||
.collect(),
|
||||
metadata,
|
||||
awaits_download: false,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-layer-file metadata tracked next to each layer path.
///
/// Every field has to be an `Option`, because a remote [`IndexPart`] can come from a
/// different version, which might carry less or more metadata depending on whether an
/// upgrade is being applied or rolled back.
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
#[cfg_attr(test, derive(Default))]
pub struct LayerFileMetadata {
    /// Size of the layer file in bytes; `None` when it was never recorded.
    file_size: Option<u64>,
}
|
||||
|
||||
/// Builds the in-memory metadata from its serialized form by copying `file_size`.
impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
    fn from(other: &IndexLayerMetadata) -> Self {
        let file_size = other.file_size;
        Self { file_size }
    }
}
|
||||
|
||||
impl LayerFileMetadata {
|
||||
pub fn new(file_size: u64) -> Self {
|
||||
LayerFileMetadata {
|
||||
file_size: Some(file_size),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn file_size(&self) -> Option<u64> {
|
||||
self.file_size
|
||||
}
|
||||
|
||||
/// Metadata has holes due to version upgrades. This method is called to upgrade self with the
|
||||
/// other value.
|
||||
///
|
||||
/// This is called on the possibly outdated version.
|
||||
pub fn merge(&mut self, other: &Self) {
|
||||
self.file_size = other.file_size.or(self.file_size);
|
||||
}
|
||||
}
|
||||
|
||||
/// Part of the remote index, corresponding to a certain timeline.
|
||||
/// Contains the data about all files in the timeline, present remotely and its metadata.
|
||||
///
|
||||
/// This type needs to be backwards and forwards compatible. When changing the fields,
|
||||
/// remember to add a test case for the changed version.
|
||||
#[serde_as]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct IndexPart {
|
||||
/// Debugging aid describing the version of this type.
|
||||
#[serde(default)]
|
||||
version: usize,
|
||||
|
||||
/// Each of the layers present on remote storage.
|
||||
///
|
||||
/// Additional metadata can might exist in `layer_metadata`.
|
||||
timeline_layers: HashSet<RelativePath>,
|
||||
|
||||
/// Currently is not really used in pageserver,
|
||||
/// present to manually keep track of the layer files that pageserver might never retrieve.
|
||||
///
|
||||
/// Such "holes" might appear if any upload task was evicted on an error threshold:
|
||||
/// the this layer will only be rescheduled for upload on pageserver restart.
|
||||
missing_layers: HashSet<RelativePath>,
|
||||
|
||||
/// Per layer file metadata, which can be present for a present or missing layer file.
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
/// that latest version stores.
|
||||
#[serde(default)]
|
||||
layer_metadata: HashMap<RelativePath, IndexLayerMetadata>,
|
||||
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata_bytes: Vec<u8>,
|
||||
}
|
||||
|
||||
impl IndexPart {
|
||||
/// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
|
||||
/// used to understand later versions.
|
||||
///
|
||||
/// Version is currently informative only.
|
||||
const LATEST_VERSION: usize = 1;
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -288,8 +387,10 @@ impl IndexPart {
|
||||
metadata_bytes: Vec<u8>,
|
||||
) -> Self {
|
||||
Self {
|
||||
version: Self::LATEST_VERSION,
|
||||
timeline_layers,
|
||||
missing_layers,
|
||||
layer_metadata: HashMap::default(),
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
}
|
||||
@@ -304,35 +405,68 @@ impl IndexPart {
|
||||
remote_timeline: RemoteTimeline,
|
||||
) -> anyhow::Result<Self> {
|
||||
let metadata_bytes = remote_timeline.metadata.to_bytes()?;
|
||||
|
||||
let mut layer_metadata = HashMap::new();
|
||||
|
||||
let mut missing_layers = HashSet::new();
|
||||
|
||||
separate_paths_and_metadata(
|
||||
timeline_path,
|
||||
&remote_timeline.missing_layers,
|
||||
&mut missing_layers,
|
||||
&mut layer_metadata,
|
||||
)
|
||||
.context("Failed to convert missing layers' paths to relative ones")?;
|
||||
|
||||
let mut timeline_layers = HashSet::new();
|
||||
|
||||
separate_paths_and_metadata(
|
||||
timeline_path,
|
||||
&remote_timeline.timeline_layers,
|
||||
&mut timeline_layers,
|
||||
&mut layer_metadata,
|
||||
)
|
||||
.context("Failed to convert timeline layers' paths to relative ones")?;
|
||||
|
||||
Ok(Self {
|
||||
timeline_layers: to_relative_paths(timeline_path, remote_timeline.timeline_layers)
|
||||
.context("Failed to convert timeline layers' paths to relative ones")?,
|
||||
missing_layers: to_relative_paths(timeline_path, remote_timeline.missing_layers)
|
||||
.context("Failed to convert missing layers' paths to relative ones")?,
|
||||
version: Self::LATEST_VERSION,
|
||||
timeline_layers,
|
||||
missing_layers,
|
||||
layer_metadata,
|
||||
disk_consistent_lsn: remote_timeline.metadata.disk_consistent_lsn(),
|
||||
metadata_bytes,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn to_local_paths(
|
||||
timeline_path: &Path,
|
||||
paths: impl IntoIterator<Item = RelativePath>,
|
||||
) -> HashSet<PathBuf> {
|
||||
paths
|
||||
.into_iter()
|
||||
.map(|path| path.as_path(timeline_path))
|
||||
.collect()
|
||||
/// Serialized form of [`LayerFileMetadata`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
|
||||
pub struct IndexLayerMetadata {
|
||||
file_size: Option<u64>,
|
||||
}
|
||||
|
||||
fn to_relative_paths(
|
||||
/// Builds the serialized metadata from the in-memory form by copying `file_size`.
impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
    fn from(other: &'_ LayerFileMetadata) -> Self {
        let file_size = other.file_size;
        Self { file_size }
    }
}
|
||||
|
||||
fn separate_paths_and_metadata(
|
||||
timeline_path: &Path,
|
||||
paths: impl IntoIterator<Item = PathBuf>,
|
||||
) -> anyhow::Result<HashSet<RelativePath>> {
|
||||
paths
|
||||
.into_iter()
|
||||
.map(|path| RelativePath::new(timeline_path, path))
|
||||
.collect()
|
||||
input: &HashMap<PathBuf, LayerFileMetadata>,
|
||||
output: &mut HashSet<RelativePath>,
|
||||
layer_metadata: &mut HashMap<RelativePath, IndexLayerMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
for (path, metadata) in input {
|
||||
let rel_path = RelativePath::new(timeline_path, path)?;
|
||||
let metadata = IndexLayerMetadata::from(metadata);
|
||||
|
||||
layer_metadata.insert(rel_path.clone(), metadata);
|
||||
output.insert(rel_path);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -357,13 +491,13 @@ mod tests {
|
||||
DEFAULT_PG_VERSION,
|
||||
);
|
||||
let remote_timeline = RemoteTimeline {
|
||||
timeline_layers: HashSet::from([
|
||||
timeline_path.join("layer_1"),
|
||||
timeline_path.join("layer_2"),
|
||||
timeline_layers: HashMap::from([
|
||||
(timeline_path.join("layer_1"), LayerFileMetadata::new(1)),
|
||||
(timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
|
||||
]),
|
||||
missing_layers: HashSet::from([
|
||||
timeline_path.join("missing_1"),
|
||||
timeline_path.join("missing_2"),
|
||||
missing_layers: HashMap::from([
|
||||
(timeline_path.join("missing_1"), LayerFileMetadata::new(3)),
|
||||
(timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
|
||||
]),
|
||||
metadata: metadata.clone(),
|
||||
awaits_download: false,
|
||||
@@ -485,13 +619,13 @@ mod tests {
|
||||
let conversion_result = IndexPart::from_remote_timeline(
|
||||
&timeline_path,
|
||||
RemoteTimeline {
|
||||
timeline_layers: HashSet::from([
|
||||
PathBuf::from("bad_path"),
|
||||
timeline_path.join("layer_2"),
|
||||
timeline_layers: HashMap::from([
|
||||
(PathBuf::from("bad_path"), LayerFileMetadata::new(1)),
|
||||
(timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
|
||||
]),
|
||||
missing_layers: HashSet::from([
|
||||
timeline_path.join("missing_1"),
|
||||
timeline_path.join("missing_2"),
|
||||
missing_layers: HashMap::from([
|
||||
(timeline_path.join("missing_1"), LayerFileMetadata::new(3)),
|
||||
(timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
|
||||
]),
|
||||
metadata: metadata.clone(),
|
||||
awaits_download: false,
|
||||
@@ -502,13 +636,13 @@ mod tests {
|
||||
let conversion_result = IndexPart::from_remote_timeline(
|
||||
&timeline_path,
|
||||
RemoteTimeline {
|
||||
timeline_layers: HashSet::from([
|
||||
timeline_path.join("layer_1"),
|
||||
timeline_path.join("layer_2"),
|
||||
timeline_layers: HashMap::from([
|
||||
(timeline_path.join("layer_1"), LayerFileMetadata::new(1)),
|
||||
(timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
|
||||
]),
|
||||
missing_layers: HashSet::from([
|
||||
PathBuf::from("bad_path"),
|
||||
timeline_path.join("missing_2"),
|
||||
missing_layers: HashMap::from([
|
||||
(PathBuf::from("bad_path"), LayerFileMetadata::new(3)),
|
||||
(timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
|
||||
]),
|
||||
metadata,
|
||||
awaits_download: false,
|
||||
@@ -516,4 +650,63 @@ mod tests {
|
||||
);
|
||||
assert!(conversion_result.is_err(), "Should not be able to convert metadata with missing layer paths that are not in the timeline directory");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v0_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"missing_layers":["not_a_real_layer_but_adding_coverage"],
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 0,
|
||||
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
|
||||
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
|
||||
layer_metadata: HashMap::default(),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v1_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"version":1,
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"missing_layers":["not_a_real_layer_but_adding_coverage"],
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
|
||||
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
|
||||
layer_metadata: HashMap::from([
|
||||
(RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata {
|
||||
file_size: Some(25600000),
|
||||
}),
|
||||
(RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: Some(9007199254741001),
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,14 +69,25 @@ pub(super) async fn upload_timeline_layers<'a>(
|
||||
.map(|meta| meta.disk_consistent_lsn());
|
||||
|
||||
let already_uploaded_layers = remote_timeline
|
||||
.map(|timeline| timeline.stored_files())
|
||||
.cloned()
|
||||
.map(|timeline| {
|
||||
timeline
|
||||
.stored_files()
|
||||
.keys()
|
||||
.cloned()
|
||||
.collect::<std::collections::HashSet<_>>()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
let layers_to_upload = upload
|
||||
.layers_to_upload
|
||||
.difference(&already_uploaded_layers)
|
||||
.cloned()
|
||||
.iter()
|
||||
.filter_map(|(k, v)| {
|
||||
if !already_uploaded_layers.contains(k) {
|
||||
Some((k.to_owned(), v.to_owned()))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if layers_to_upload.is_empty() {
|
||||
@@ -98,7 +109,7 @@ pub(super) async fn upload_timeline_layers<'a>(
|
||||
|
||||
let mut upload_tasks = layers_to_upload
|
||||
.into_iter()
|
||||
.map(|source_path| async move {
|
||||
.map(|(source_path, known_metadata)| async move {
|
||||
let source_file = match fs::File::open(&source_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to upen a source file for layer '{}'",
|
||||
@@ -109,7 +120,7 @@ pub(super) async fn upload_timeline_layers<'a>(
|
||||
Err(e) => return Err(UploadError::MissingLocalFile(source_path, e)),
|
||||
};
|
||||
|
||||
let source_size = source_file
|
||||
let fs_size = source_file
|
||||
.metadata()
|
||||
.await
|
||||
.with_context(|| {
|
||||
@@ -119,10 +130,24 @@ pub(super) async fn upload_timeline_layers<'a>(
|
||||
)
|
||||
})
|
||||
.map_err(UploadError::Other)?
|
||||
.len() as usize;
|
||||
.len();
|
||||
|
||||
// FIXME: this looks bad
|
||||
if let Some(metadata_size) = known_metadata.file_size() {
|
||||
if metadata_size != fs_size {
|
||||
return Err(UploadError::Other(anyhow::anyhow!(
|
||||
"File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
// this is a silly state we would like to avoid
|
||||
}
|
||||
|
||||
let fs_size = usize::try_from(fs_size).with_context(|| format!("File {source_path:?} size {fs_size} could not be converted to usize"))
|
||||
.map_err(UploadError::Other)?;
|
||||
|
||||
match storage
|
||||
.upload_storage_object(Box::new(source_file), source_size, &source_path)
|
||||
.upload_storage_object(Box::new(source_file), fs_size, &source_path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to upload layer file for {sync_id}"))
|
||||
{
|
||||
@@ -136,8 +161,11 @@ pub(super) async fn upload_timeline_layers<'a>(
|
||||
while let Some(upload_result) = upload_tasks.next().await {
|
||||
match upload_result {
|
||||
Ok(uploaded_path) => {
|
||||
upload.layers_to_upload.remove(&uploaded_path);
|
||||
upload.uploaded_layers.insert(uploaded_path);
|
||||
let metadata = upload
|
||||
.layers_to_upload
|
||||
.remove(&uploaded_path)
|
||||
.expect("metadata should always exist, assuming no double uploads");
|
||||
upload.uploaded_layers.insert(uploaded_path, metadata);
|
||||
}
|
||||
Err(e) => match e {
|
||||
UploadError::Other(e) => {
|
||||
@@ -262,7 +290,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
upload
|
||||
.uploaded_layers
|
||||
.iter()
|
||||
.keys()
|
||||
.cloned()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
layer_files
|
||||
@@ -357,7 +385,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
upload
|
||||
.uploaded_layers
|
||||
.iter()
|
||||
.keys()
|
||||
.cloned()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
layer_files
|
||||
|
||||
@@ -59,13 +59,14 @@ pub mod block_io;
|
||||
mod delta_layer;
|
||||
mod disk_btree;
|
||||
pub(crate) mod ephemeral_file;
|
||||
mod filename;
|
||||
pub mod filename;
|
||||
mod image_layer;
|
||||
mod inmemory_layer;
|
||||
mod layer_map;
|
||||
pub mod layer_map;
|
||||
|
||||
pub mod metadata;
|
||||
mod par_fsync;
|
||||
mod storage_layer;
|
||||
pub mod storage_layer;
|
||||
|
||||
mod timeline;
|
||||
|
||||
@@ -144,17 +145,18 @@ impl Tenant {
|
||||
|
||||
/// Lists timelines the tenant contains.
|
||||
/// Up to tenant's implementation to omit certain timelines that are not considered ready for use.
|
||||
pub fn list_timelines(&self) -> Vec<(TimelineId, Arc<Timeline>)> {
|
||||
pub fn list_timelines(&self) -> Vec<Arc<Timeline>> {
|
||||
self.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.iter()
|
||||
.map(|(timeline_id, timeline_entry)| (*timeline_id, Arc::clone(timeline_entry)))
|
||||
.values()
|
||||
.map(Arc::clone)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Create a new, empty timeline. The caller is responsible for loading data into it
|
||||
/// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
|
||||
/// This is used to create the initial 'main' timeline during bootstrapping,
|
||||
/// or when importing a new base backup. The caller is expected to load an
|
||||
/// initial image of the datadir to the new timeline after this.
|
||||
pub fn create_empty_timeline(
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
@@ -345,7 +347,7 @@ impl Tenant {
|
||||
|
||||
ensure!(
|
||||
!children_exist,
|
||||
"Cannot detach timeline which has child timelines"
|
||||
"Cannot delete timeline which has child timelines"
|
||||
);
|
||||
let timeline_entry = match timelines.entry(timeline_id) {
|
||||
Entry::Occupied(e) => e,
|
||||
@@ -906,6 +908,7 @@ impl Tenant {
|
||||
Ok(totals)
|
||||
}
|
||||
|
||||
/// Branch an existing timeline
|
||||
fn branch_timeline(
|
||||
&self,
|
||||
src: TimelineId,
|
||||
@@ -981,7 +984,7 @@ impl Tenant {
|
||||
dst_prev,
|
||||
Some(src),
|
||||
start_lsn,
|
||||
*src_timeline.latest_gc_cutoff_lsn.read(),
|
||||
*src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
|
||||
src_timeline.initdb_lsn,
|
||||
src_timeline.pg_version,
|
||||
);
|
||||
@@ -1094,12 +1097,22 @@ impl Tenant {
|
||||
|
||||
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
|
||||
/// to get bootstrap data for timeline initialization.
|
||||
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path, pg_version: u32) -> Result<()> {
|
||||
info!("running initdb in {}... ", initdbpath.display());
|
||||
fn run_initdb(
|
||||
conf: &'static PageServerConf,
|
||||
initdb_target_dir: &Path,
|
||||
pg_version: u32,
|
||||
) -> Result<()> {
|
||||
let initdb_bin_path = conf.pg_bin_dir(pg_version).join("initdb");
|
||||
let initdb_lib_dir = conf.pg_lib_dir(pg_version);
|
||||
info!(
|
||||
"running {} in {}, libdir: {}",
|
||||
initdb_bin_path.display(),
|
||||
initdb_target_dir.display(),
|
||||
initdb_lib_dir.display(),
|
||||
);
|
||||
|
||||
let initdb_path = conf.pg_bin_dir(pg_version).join("initdb");
|
||||
let initdb_output = Command::new(initdb_path)
|
||||
.args(&["-D", &initdbpath.to_string_lossy()])
|
||||
let initdb_output = Command::new(initdb_bin_path)
|
||||
.args(&["-D", &initdb_target_dir.to_string_lossy()])
|
||||
.args(&["-U", &conf.superuser])
|
||||
.args(&["-E", "utf8"])
|
||||
.arg("--no-instructions")
|
||||
@@ -1107,8 +1120,8 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path, pg_version: u32)
|
||||
// so no need to fsync it
|
||||
.arg("--no-sync")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
|
||||
.env("LD_LIBRARY_PATH", &initdb_lib_dir)
|
||||
.env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.context("failed to execute initdb")?;
|
||||
|
||||
@@ -556,7 +556,7 @@ impl DeltaLayer {
|
||||
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
|
||||
/// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
|
||||
pub fn new_for_path<F>(path: &Path, file: F) -> Result<Self>
|
||||
where
|
||||
F: FileExt,
|
||||
|
||||
@@ -177,7 +177,7 @@ impl fmt::Display for ImageFileName {
|
||||
///
|
||||
/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
|
||||
/// global config, and paths to layer files are constructed using the tenant/timeline
|
||||
/// path from the config. But in the 'dump_layerfile' binary, we need to construct a Layer
|
||||
/// path from the config. But in the 'pageserver_binutils' binary, we need to construct a Layer
|
||||
/// struct for a file on disk, without having a page server running, so that we have no
|
||||
/// config. In that case, we use the Path variant to hold the full path to the file on
|
||||
/// disk.
|
||||
|
||||
@@ -357,7 +357,7 @@ impl ImageLayer {
|
||||
|
||||
/// Create an ImageLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
|
||||
/// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
|
||||
pub fn new_for_path<F>(path: &Path, file: F) -> Result<ImageLayer>
|
||||
where
|
||||
F: std::os::unix::prelude::FileExt,
|
||||
|
||||
@@ -15,25 +15,19 @@ use crate::repository::Key;
|
||||
use crate::tenant::inmemory_layer::InMemoryLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::tenant::storage_layer::{range_eq, range_overlaps};
|
||||
use amplify_num::i256;
|
||||
use anyhow::Result;
|
||||
use std::collections::{BTreeMap, VecDeque};
|
||||
use num_traits::identities::{One, Zero};
|
||||
use num_traits::{Bounded, Num, Signed};
|
||||
use rstar::{RTree, RTreeObject, AABB};
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
use std::ops::{Add, Div, Mul, Neg, Rem, Sub};
|
||||
use std::sync::Arc;
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, PartialOrd, Ord, Eq)]
|
||||
struct BTreeKey {
|
||||
lsn: Lsn,
|
||||
seq: usize,
|
||||
}
|
||||
|
||||
impl BTreeKey {
|
||||
fn new(lsn: Lsn) -> BTreeKey {
|
||||
BTreeKey { lsn, seq: 0 }
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// LayerMap tracks what layers exist on a timeline.
|
||||
///
|
||||
@@ -59,14 +53,175 @@ pub struct LayerMap {
|
||||
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
|
||||
|
||||
/// All the historic layers are kept here
|
||||
historic_layers: BTreeMap<BTreeKey, Arc<dyn Layer>>,
|
||||
layers_seqno: usize,
|
||||
historic_layers: RTree<LayerRTreeObject>,
|
||||
|
||||
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
|
||||
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
|
||||
l0_delta_layers: Vec<Arc<dyn Layer>>,
|
||||
}
|
||||
|
||||
struct LayerRTreeObject {
|
||||
layer: Arc<dyn Layer>,
|
||||
|
||||
envelope: AABB<[IntKey; 2]>,
|
||||
}
|
||||
|
||||
// Representation of Key as numeric type.
|
||||
// We cannot use the native i128 type, because rstar::RTree
|
||||
// doesn't properly handle integer overflow during area calculation: sum(Xi*Yi).
|
||||
// Overflow will cause panic in debug mode and incorrect area calculation in release mode,
|
||||
// which leads to a non-optimally balanced R-Tree (but doesn't affect the correctness of R-Tree operation).
|
||||
// By using i256 as the type, even though all the actual values would fit in i128, we can be
|
||||
// sure that multiplication doesn't overflow.
|
||||
//
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)]
|
||||
struct IntKey(i256);
|
||||
|
||||
impl Copy for IntKey {}
|
||||
|
||||
impl IntKey {
|
||||
fn from(i: i128) -> Self {
|
||||
IntKey(i256::from(i))
|
||||
}
|
||||
}
|
||||
|
||||
impl Bounded for IntKey {
|
||||
fn min_value() -> Self {
|
||||
IntKey(i256::MIN)
|
||||
}
|
||||
fn max_value() -> Self {
|
||||
IntKey(i256::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
impl Signed for IntKey {
|
||||
fn is_positive(&self) -> bool {
|
||||
self.0 > i256::ZERO
|
||||
}
|
||||
fn is_negative(&self) -> bool {
|
||||
self.0 < i256::ZERO
|
||||
}
|
||||
fn signum(&self) -> Self {
|
||||
match self.0.cmp(&i256::ZERO) {
|
||||
Ordering::Greater => IntKey(i256::ONE),
|
||||
Ordering::Less => IntKey(-i256::ONE),
|
||||
Ordering::Equal => IntKey(i256::ZERO),
|
||||
}
|
||||
}
|
||||
fn abs(&self) -> Self {
|
||||
IntKey(self.0.abs())
|
||||
}
|
||||
fn abs_sub(&self, other: &Self) -> Self {
|
||||
if self.0 <= other.0 {
|
||||
IntKey(i256::ZERO)
|
||||
} else {
|
||||
IntKey(self.0 - other.0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Neg for IntKey {
|
||||
type Output = Self;
|
||||
fn neg(self) -> Self::Output {
|
||||
IntKey(-self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Rem for IntKey {
|
||||
type Output = Self;
|
||||
fn rem(self, rhs: Self) -> Self::Output {
|
||||
IntKey(self.0 % rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Div for IntKey {
|
||||
type Output = Self;
|
||||
fn div(self, rhs: Self) -> Self::Output {
|
||||
IntKey(self.0 / rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Add for IntKey {
|
||||
type Output = Self;
|
||||
fn add(self, rhs: Self) -> Self::Output {
|
||||
IntKey(self.0 + rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Sub for IntKey {
|
||||
type Output = Self;
|
||||
fn sub(self, rhs: Self) -> Self::Output {
|
||||
IntKey(self.0 - rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Mul for IntKey {
|
||||
type Output = Self;
|
||||
fn mul(self, rhs: Self) -> Self::Output {
|
||||
IntKey(self.0 * rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl One for IntKey {
|
||||
fn one() -> Self {
|
||||
IntKey(i256::ONE)
|
||||
}
|
||||
}
|
||||
|
||||
impl Zero for IntKey {
|
||||
fn zero() -> Self {
|
||||
IntKey(i256::ZERO)
|
||||
}
|
||||
fn is_zero(&self) -> bool {
|
||||
self.0 == i256::ZERO
|
||||
}
|
||||
}
|
||||
|
||||
impl Num for IntKey {
|
||||
type FromStrRadixErr = <i128 as Num>::FromStrRadixErr;
|
||||
fn from_str_radix(str: &str, radix: u32) -> Result<Self, Self::FromStrRadixErr> {
|
||||
Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?)))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for LayerRTreeObject {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
// FIXME: ptr_eq might fail to return true for 'dyn'
|
||||
// references. Clippy complains about this. In practice it
|
||||
// seems to work, the assertion below would be triggered
|
||||
// otherwise but this ought to be fixed.
|
||||
#[allow(clippy::vtable_address_comparisons)]
|
||||
Arc::ptr_eq(&self.layer, &other.layer)
|
||||
}
|
||||
}
|
||||
|
||||
impl RTreeObject for LayerRTreeObject {
|
||||
type Envelope = AABB<[IntKey; 2]>;
|
||||
fn envelope(&self) -> Self::Envelope {
|
||||
self.envelope
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerRTreeObject {
|
||||
fn new(layer: Arc<dyn Layer>) -> Self {
|
||||
let key_range = layer.get_key_range();
|
||||
let lsn_range = layer.get_lsn_range();
|
||||
|
||||
let envelope = AABB::from_corners(
|
||||
[
|
||||
IntKey::from(key_range.start.to_i128()),
|
||||
IntKey::from(lsn_range.start.0 as i128),
|
||||
],
|
||||
[
|
||||
IntKey::from(key_range.end.to_i128() - 1),
|
||||
IntKey::from(lsn_range.end.0 as i128 - 1),
|
||||
], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive
|
||||
);
|
||||
LayerRTreeObject { layer, envelope }
|
||||
}
|
||||
}
|
||||
|
||||
/// Return value of LayerMap::search
|
||||
pub struct SearchResult {
|
||||
pub layer: Arc<dyn Layer>,
|
||||
@@ -89,17 +244,23 @@ impl LayerMap {
|
||||
// linear search
|
||||
// Find the latest image layer that covers the given key
|
||||
let mut latest_img: Option<Arc<dyn Layer>> = None;
|
||||
let mut latest_img_lsn = Lsn(0);
|
||||
let mut iter = self
|
||||
let mut latest_img_lsn: Option<Lsn> = None;
|
||||
let envelope = AABB::from_corners(
|
||||
[IntKey::from(key.to_i128()), IntKey::from(0i128)],
|
||||
[
|
||||
IntKey::from(key.to_i128()),
|
||||
IntKey::from(end_lsn.0 as i128 - 1),
|
||||
],
|
||||
);
|
||||
for e in self
|
||||
.historic_layers
|
||||
.range(BTreeKey::new(Lsn(0))..BTreeKey::new(end_lsn));
|
||||
while let Some((_key, l)) = iter.next_back() {
|
||||
.locate_in_envelope_intersecting(&envelope)
|
||||
{
|
||||
let l = &e.layer;
|
||||
if l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if !l.get_key_range().contains(&key) {
|
||||
continue;
|
||||
}
|
||||
assert!(l.get_key_range().contains(&key));
|
||||
let img_lsn = l.get_lsn_range().start;
|
||||
assert!(img_lsn < end_lsn);
|
||||
if Lsn(img_lsn.0 + 1) == end_lsn {
|
||||
@@ -109,23 +270,23 @@ impl LayerMap {
|
||||
lsn_floor: img_lsn,
|
||||
}));
|
||||
}
|
||||
latest_img = Some(Arc::clone(l));
|
||||
latest_img_lsn = img_lsn;
|
||||
break;
|
||||
if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) {
|
||||
latest_img = Some(Arc::clone(l));
|
||||
latest_img_lsn = Some(img_lsn);
|
||||
}
|
||||
}
|
||||
|
||||
// Search the delta layers
|
||||
let mut latest_delta: Option<Arc<dyn Layer>> = None;
|
||||
let mut iter = self
|
||||
for e in self
|
||||
.historic_layers
|
||||
.range(BTreeKey::new(Lsn(0))..BTreeKey::new(end_lsn));
|
||||
while let Some((_key, l)) = iter.next_back() {
|
||||
.locate_in_envelope_intersecting(&envelope)
|
||||
{
|
||||
let l = &e.layer;
|
||||
if !l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if !l.get_key_range().contains(&key) {
|
||||
continue;
|
||||
}
|
||||
assert!(l.get_key_range().contains(&key));
|
||||
if l.get_lsn_range().start >= end_lsn {
|
||||
info!(
|
||||
"Candidate delta layer {}..{} is too new for lsn {}",
|
||||
@@ -135,9 +296,6 @@ impl LayerMap {
|
||||
);
|
||||
}
|
||||
assert!(l.get_lsn_range().start < end_lsn);
|
||||
if l.get_lsn_range().end <= latest_img_lsn {
|
||||
continue;
|
||||
}
|
||||
if l.get_lsn_range().end >= end_lsn {
|
||||
// this layer contains the requested point in the key/lsn space.
|
||||
// No need to search any further
|
||||
@@ -163,7 +321,10 @@ impl LayerMap {
|
||||
"found (old) layer {} for request on {key} at {end_lsn}",
|
||||
l.filename().display(),
|
||||
);
|
||||
let lsn_floor = std::cmp::max(Lsn(latest_img_lsn.0 + 1), l.get_lsn_range().start);
|
||||
let lsn_floor = std::cmp::max(
|
||||
Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
|
||||
l.get_lsn_range().start,
|
||||
);
|
||||
Ok(Some(SearchResult {
|
||||
lsn_floor,
|
||||
layer: l,
|
||||
@@ -171,7 +332,7 @@ impl LayerMap {
|
||||
} else if let Some(l) = latest_img {
|
||||
trace!("found img layer and no deltas for request on {key} at {end_lsn}");
|
||||
Ok(Some(SearchResult {
|
||||
lsn_floor: latest_img_lsn,
|
||||
lsn_floor: latest_img_lsn.unwrap(),
|
||||
layer: l,
|
||||
}))
|
||||
} else {
|
||||
@@ -187,14 +348,7 @@ impl LayerMap {
|
||||
if layer.get_key_range() == (Key::MIN..Key::MAX) {
|
||||
self.l0_delta_layers.push(layer.clone());
|
||||
}
|
||||
self.historic_layers.insert(
|
||||
BTreeKey {
|
||||
lsn: layer.get_lsn_range().start,
|
||||
seq: self.layers_seqno,
|
||||
},
|
||||
layer,
|
||||
);
|
||||
self.layers_seqno += 1;
|
||||
self.historic_layers.insert(LayerRTreeObject::new(layer));
|
||||
NUM_ONDISK_LAYERS.inc();
|
||||
}
|
||||
|
||||
@@ -216,26 +370,10 @@ impl LayerMap {
|
||||
.retain(|other| !Arc::ptr_eq(other, &layer));
|
||||
assert_eq!(self.l0_delta_layers.len(), len_before - 1);
|
||||
}
|
||||
let len_before = self.historic_layers.len();
|
||||
#[allow(clippy::vtable_address_comparisons)]
|
||||
self.historic_layers
|
||||
.retain(|_key, other| !Arc::ptr_eq(other, &layer));
|
||||
if self.historic_layers.len() != len_before - 1 {
|
||||
assert!(self.historic_layers.len() == len_before);
|
||||
error!(
|
||||
"Failed to remove {} layer: {}..{}__{}..{}",
|
||||
if layer.is_incremental() {
|
||||
"inremental"
|
||||
} else {
|
||||
"image"
|
||||
},
|
||||
layer.get_key_range().start,
|
||||
layer.get_key_range().end,
|
||||
layer.get_lsn_range().start,
|
||||
layer.get_lsn_range().end
|
||||
);
|
||||
}
|
||||
assert!(self.historic_layers.len() == len_before - 1);
|
||||
assert!(self
|
||||
.historic_layers
|
||||
.remove(&LayerRTreeObject::new(layer))
|
||||
.is_some());
|
||||
NUM_ONDISK_LAYERS.dec();
|
||||
}
|
||||
|
||||
@@ -252,10 +390,21 @@ impl LayerMap {
|
||||
|
||||
loop {
|
||||
let mut made_progress = false;
|
||||
for (_key, l) in self
|
||||
let envelope = AABB::from_corners(
|
||||
[
|
||||
IntKey::from(range_remain.start.to_i128()),
|
||||
IntKey::from(lsn_range.start.0 as i128),
|
||||
],
|
||||
[
|
||||
IntKey::from(range_remain.end.to_i128() - 1),
|
||||
IntKey::from(lsn_range.end.0 as i128 - 1),
|
||||
],
|
||||
);
|
||||
for e in self
|
||||
.historic_layers
|
||||
.range(BTreeKey::new(lsn_range.start)..BTreeKey::new(lsn_range.end))
|
||||
.locate_in_envelope_intersecting(&envelope)
|
||||
{
|
||||
let l = &e.layer;
|
||||
if l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
@@ -278,30 +427,39 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<dyn Layer>> {
|
||||
self.historic_layers
|
||||
.iter()
|
||||
.map(|(_key, layer)| layer.clone())
|
||||
self.historic_layers.iter().map(|e| e.layer.clone())
|
||||
}
|
||||
|
||||
/// Find the last image layer that covers 'key', ignoring any image layers
|
||||
/// newer than 'lsn'.
|
||||
fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
let mut iter = self
|
||||
let mut candidate_lsn = Lsn(0);
|
||||
let mut candidate = None;
|
||||
let envelope = AABB::from_corners(
|
||||
[IntKey::from(key.to_i128()), IntKey::from(0)],
|
||||
[IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)],
|
||||
);
|
||||
for e in self
|
||||
.historic_layers
|
||||
.range(BTreeKey::new(Lsn(0))..BTreeKey::new(lsn + 1));
|
||||
while let Some((_key, l)) = iter.next_back() {
|
||||
.locate_in_envelope_intersecting(&envelope)
|
||||
{
|
||||
let l = &e.layer;
|
||||
if l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if !l.get_key_range().contains(&key) {
|
||||
continue;
|
||||
}
|
||||
assert!(l.get_key_range().contains(&key));
|
||||
let this_lsn = l.get_lsn_range().start;
|
||||
assert!(this_lsn <= lsn);
|
||||
return Some(Arc::clone(l));
|
||||
if this_lsn < candidate_lsn {
|
||||
// our previous candidate was better
|
||||
continue;
|
||||
}
|
||||
candidate_lsn = this_lsn;
|
||||
candidate = Some(Arc::clone(l));
|
||||
}
|
||||
None
|
||||
|
||||
candidate
|
||||
}
|
||||
|
||||
///
|
||||
@@ -318,10 +476,18 @@ impl LayerMap {
|
||||
lsn: Lsn,
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<dyn Layer>>)>> {
|
||||
let mut points = vec![key_range.start];
|
||||
for (_lsn, l) in self
|
||||
let envelope = AABB::from_corners(
|
||||
[IntKey::from(key_range.start.to_i128()), IntKey::from(0)],
|
||||
[
|
||||
IntKey::from(key_range.end.to_i128()),
|
||||
IntKey::from(lsn.0 as i128),
|
||||
],
|
||||
);
|
||||
for e in self
|
||||
.historic_layers
|
||||
.range(BTreeKey::new(Lsn(0))..BTreeKey::new(lsn + 1))
|
||||
.locate_in_envelope_intersecting(&envelope)
|
||||
{
|
||||
let l = &e.layer;
|
||||
assert!(l.get_lsn_range().start <= lsn);
|
||||
let range = l.get_key_range();
|
||||
if key_range.contains(&range.start) {
|
||||
@@ -358,17 +524,26 @@ impl LayerMap {
|
||||
if lsn_range.start >= lsn_range.end {
|
||||
return Ok(0);
|
||||
}
|
||||
for (_lsn, l) in self
|
||||
let envelope = AABB::from_corners(
|
||||
[
|
||||
IntKey::from(key_range.start.to_i128()),
|
||||
IntKey::from(lsn_range.start.0 as i128),
|
||||
],
|
||||
[
|
||||
IntKey::from(key_range.end.to_i128() - 1),
|
||||
IntKey::from(lsn_range.end.0 as i128 - 1),
|
||||
],
|
||||
);
|
||||
for e in self
|
||||
.historic_layers
|
||||
.range(BTreeKey::new(lsn_range.start)..BTreeKey::new(lsn_range.end))
|
||||
.locate_in_envelope_intersecting(&envelope)
|
||||
{
|
||||
let l = &e.layer;
|
||||
if !l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if !range_overlaps(&l.get_key_range(), key_range) {
|
||||
continue;
|
||||
}
|
||||
assert!(range_overlaps(&l.get_lsn_range(), lsn_range));
|
||||
assert!(range_overlaps(&l.get_key_range(), key_range));
|
||||
|
||||
// We ignore level0 delta layers. Unless the whole keyspace fits
|
||||
// into one partition
|
||||
@@ -404,8 +579,8 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
println!("historic_layers:");
|
||||
for (_key, layer) in self.historic_layers.iter() {
|
||||
layer.dump(verbose)?;
|
||||
for e in self.historic_layers.iter() {
|
||||
e.layer.dump(verbose)?;
|
||||
}
|
||||
println!("End dump LayerMap");
|
||||
Ok(())
|
||||
|
||||
@@ -52,7 +52,10 @@ use crate::task_mgr::TaskKind;
|
||||
use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task};
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{page_cache, storage_sync};
|
||||
use crate::{
|
||||
page_cache,
|
||||
storage_sync::{self, index::LayerFileMetadata},
|
||||
};
|
||||
|
||||
pub struct Timeline {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -475,10 +478,6 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Mutate the timeline with a [`TimelineWriter`].
|
||||
///
|
||||
/// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter
|
||||
/// is a generic type in this trait. But that doesn't currently work in
|
||||
/// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html
|
||||
pub fn writer(&self) -> TimelineWriter<'_> {
|
||||
TimelineWriter {
|
||||
tl: self,
|
||||
@@ -1194,8 +1193,8 @@ impl Timeline {
|
||||
self.create_image_layers(&partitioning, self.initdb_lsn, true)?
|
||||
} else {
|
||||
// normal case, write out a L0 delta layer file.
|
||||
let delta_path = self.create_delta_layer(&frozen_layer)?;
|
||||
HashSet::from([delta_path])
|
||||
let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?;
|
||||
HashMap::from([(delta_path, metadata)])
|
||||
};
|
||||
|
||||
fail_point!("flush-frozen-before-sync");
|
||||
@@ -1221,85 +1220,86 @@ impl Timeline {
|
||||
// TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
|
||||
// *all* the layers, to avoid fsyncing the file multiple times.
|
||||
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
|
||||
self.update_disk_consistent_lsn(disk_consistent_lsn, layer_paths_to_upload)?;
|
||||
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
|
||||
|
||||
// If we were able to advance 'disk_consistent_lsn', save it to the metadata file.
|
||||
// After crash, we will restart WAL streaming and processing from that point.
|
||||
if disk_consistent_lsn != old_disk_consistent_lsn {
|
||||
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
|
||||
self.update_metadata_file(disk_consistent_lsn, layer_paths_to_upload)?;
|
||||
// Also update the in-memory copy
|
||||
self.disk_consistent_lsn.store(disk_consistent_lsn);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update metadata file
|
||||
fn update_disk_consistent_lsn(
|
||||
fn update_metadata_file(
|
||||
&self,
|
||||
disk_consistent_lsn: Lsn,
|
||||
layer_paths_to_upload: HashSet<PathBuf>,
|
||||
layer_paths_to_upload: HashMap<PathBuf, LayerFileMetadata>,
|
||||
) -> Result<()> {
|
||||
// If we were able to advance 'disk_consistent_lsn', save it to the metadata file.
|
||||
// After crash, we will restart WAL streaming and processing from that point.
|
||||
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
|
||||
if disk_consistent_lsn != old_disk_consistent_lsn {
|
||||
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
|
||||
// We can only save a valid 'prev_record_lsn' value on disk if we
|
||||
// flushed *all* in-memory changes to disk. We only track
|
||||
// 'prev_record_lsn' in memory for the latest processed record, so we
|
||||
// don't remember what the correct value that corresponds to some old
|
||||
// LSN is. But if we flush everything, then the value corresponding to the
|
||||
// current 'last_record_lsn' is correct and we can store it on disk.
|
||||
let RecordLsn {
|
||||
last: last_record_lsn,
|
||||
prev: prev_record_lsn,
|
||||
} = self.last_record_lsn.load();
|
||||
let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn {
|
||||
Some(prev_record_lsn)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// We can only save a valid 'prev_record_lsn' value on disk if we
|
||||
// flushed *all* in-memory changes to disk. We only track
|
||||
// 'prev_record_lsn' in memory for the latest processed record, so we
|
||||
// don't remember what the correct value that corresponds to some old
|
||||
// LSN is. But if we flush everything, then the value corresponding
|
||||
// to the current 'last_record_lsn' is correct and we can store it on disk.
|
||||
let RecordLsn {
|
||||
last: last_record_lsn,
|
||||
prev: prev_record_lsn,
|
||||
} = self.last_record_lsn.load();
|
||||
let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn {
|
||||
Some(prev_record_lsn)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let ancestor_timeline_id = self
|
||||
.ancestor_timeline
|
||||
.as_ref()
|
||||
.map(|ancestor| ancestor.timeline_id);
|
||||
|
||||
let ancestor_timeline_id = self
|
||||
.ancestor_timeline
|
||||
.as_ref()
|
||||
.map(|ancestor| ancestor.timeline_id);
|
||||
let metadata = TimelineMetadata::new(
|
||||
disk_consistent_lsn,
|
||||
ondisk_prev_record_lsn,
|
||||
ancestor_timeline_id,
|
||||
self.ancestor_lsn,
|
||||
*self.latest_gc_cutoff_lsn.read(),
|
||||
self.initdb_lsn,
|
||||
self.pg_version,
|
||||
);
|
||||
|
||||
let metadata = TimelineMetadata::new(
|
||||
disk_consistent_lsn,
|
||||
ondisk_prev_record_lsn,
|
||||
ancestor_timeline_id,
|
||||
self.ancestor_lsn,
|
||||
*self.latest_gc_cutoff_lsn.read(),
|
||||
self.initdb_lsn,
|
||||
self.pg_version,
|
||||
);
|
||||
fail_point!("checkpoint-before-saving-metadata", |x| bail!(
|
||||
"{}",
|
||||
x.unwrap()
|
||||
));
|
||||
|
||||
fail_point!("checkpoint-before-saving-metadata", |x| bail!(
|
||||
"{}",
|
||||
x.unwrap()
|
||||
));
|
||||
save_metadata(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
&metadata,
|
||||
false,
|
||||
)?;
|
||||
|
||||
save_metadata(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
||||
storage_sync::schedule_layer_upload(
|
||||
self.tenant_id,
|
||||
&metadata,
|
||||
false,
|
||||
)?;
|
||||
|
||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
||||
storage_sync::schedule_layer_upload(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
layer_paths_to_upload,
|
||||
Some(metadata),
|
||||
);
|
||||
}
|
||||
|
||||
// Also update the in-memory copy
|
||||
self.disk_consistent_lsn.store(disk_consistent_lsn);
|
||||
self.timeline_id,
|
||||
layer_paths_to_upload,
|
||||
Some(metadata),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Write out the given frozen in-memory layer as a new L0 delta file
|
||||
fn create_delta_layer(&self, frozen_layer: &InMemoryLayer) -> Result<PathBuf> {
|
||||
fn create_delta_layer(
|
||||
&self,
|
||||
frozen_layer: &InMemoryLayer,
|
||||
) -> Result<(PathBuf, LayerFileMetadata)> {
|
||||
// Write it out
|
||||
let new_delta = frozen_layer.write_to_disk()?;
|
||||
let new_delta_path = new_delta.path();
|
||||
@@ -1325,12 +1325,13 @@ impl Timeline {
|
||||
|
||||
// update the timeline's physical size
|
||||
let sz = new_delta_path.metadata()?.len();
|
||||
|
||||
self.metrics.current_physical_size_gauge.add(sz);
|
||||
// update metrics
|
||||
self.metrics.num_persistent_files_created.inc_by(1);
|
||||
self.metrics.persistent_bytes_written.inc_by(sz);
|
||||
|
||||
Ok(new_delta_path)
|
||||
Ok((new_delta_path, LayerFileMetadata::new(sz)))
|
||||
}
|
||||
|
||||
pub fn compact(&self) -> anyhow::Result<()> {
|
||||
@@ -1396,7 +1397,7 @@ impl Timeline {
|
||||
storage_sync::schedule_layer_upload(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
HashSet::from_iter(layer_paths_to_upload),
|
||||
layer_paths_to_upload,
|
||||
None,
|
||||
);
|
||||
}
|
||||
@@ -1477,10 +1478,9 @@ impl Timeline {
|
||||
partitioning: &KeyPartitioning,
|
||||
lsn: Lsn,
|
||||
force: bool,
|
||||
) -> Result<HashSet<PathBuf>> {
|
||||
) -> Result<HashMap<PathBuf, LayerFileMetadata>> {
|
||||
let timer = self.metrics.create_images_time_histo.start_timer();
|
||||
let mut image_layers: Vec<ImageLayer> = Vec::new();
|
||||
let mut layer_paths_to_upload = HashSet::new();
|
||||
for partition in partitioning.parts.iter() {
|
||||
if force || self.time_for_new_image_layer(partition, lsn)? {
|
||||
let img_range =
|
||||
@@ -1502,7 +1502,6 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
let image_layer = image_layer_writer.finish()?;
|
||||
layer_paths_to_upload.insert(image_layer.path());
|
||||
image_layers.push(image_layer);
|
||||
}
|
||||
}
|
||||
@@ -1516,15 +1515,25 @@ impl Timeline {
|
||||
//
|
||||
// Compaction creates multiple image layers. It would be better to create them all
|
||||
// and fsync them all in parallel.
|
||||
let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone());
|
||||
all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));
|
||||
let all_paths = image_layers
|
||||
.iter()
|
||||
.map(|layer| layer.path())
|
||||
.chain(std::iter::once(
|
||||
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
))
|
||||
.collect::<Vec<_>>();
|
||||
par_fsync::par_fsync(&all_paths)?;
|
||||
|
||||
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
|
||||
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
for l in image_layers {
|
||||
self.metrics
|
||||
.current_physical_size_gauge
|
||||
.add(l.path().metadata()?.len());
|
||||
let path = l.path();
|
||||
let metadata = path.metadata()?;
|
||||
|
||||
layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len()));
|
||||
|
||||
self.metrics.current_physical_size_gauge.add(metadata.len());
|
||||
layers.insert_historic(Arc::new(l));
|
||||
}
|
||||
drop(layers);
|
||||
@@ -1775,16 +1784,16 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let mut new_layer_paths = HashSet::with_capacity(new_layers.len());
|
||||
let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
|
||||
for l in new_layers {
|
||||
let new_delta_path = l.path();
|
||||
|
||||
// update the timeline's physical size
|
||||
self.metrics
|
||||
.current_physical_size_gauge
|
||||
.add(new_delta_path.metadata()?.len());
|
||||
let metadata = new_delta_path.metadata()?;
|
||||
|
||||
new_layer_paths.insert(new_delta_path);
|
||||
// update the timeline's physical size
|
||||
self.metrics.current_physical_size_gauge.add(metadata.len());
|
||||
|
||||
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
|
||||
layers.insert_historic(Arc::new(l));
|
||||
}
|
||||
|
||||
@@ -1950,6 +1959,9 @@ impl Timeline {
|
||||
new_gc_cutoff
|
||||
);
|
||||
write_guard.store_and_unlock(new_gc_cutoff).wait();
|
||||
|
||||
// Persist metadata file
|
||||
self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
|
||||
}
|
||||
|
||||
info!("GC starting");
|
||||
@@ -2076,6 +2088,18 @@ impl Timeline {
|
||||
result.layers_removed += 1;
|
||||
}
|
||||
|
||||
info!(
|
||||
"GC completed removing {} layers, cuttof {}",
|
||||
result.layers_removed, new_gc_cutoff
|
||||
);
|
||||
if result.layers_removed != 0 {
|
||||
fail_point!("gc-before-save-metadata", |_| {
|
||||
info!("Abnormaly terinate pageserver at gc-before-save-metadata fail point");
|
||||
std::process::abort();
|
||||
});
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
||||
storage_sync::schedule_layer_delete(
|
||||
self.tenant_id,
|
||||
|
||||
@@ -24,7 +24,7 @@ pub mod defaults {
|
||||
// This parameter determines L1 layer file size.
|
||||
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
|
||||
|
||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s";
|
||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
|
||||
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//! This module acts as a switchboard to access different repositories managed by this
|
||||
//! page server.
|
||||
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::collections::{hash_map, HashMap};
|
||||
use std::ffi::OsStr;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -14,15 +14,15 @@ use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use crate::config::{PageServerConf, METADATA_FILE_NAME};
|
||||
use crate::http::models::TenantInfo;
|
||||
use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex};
|
||||
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData};
|
||||
use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex};
|
||||
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::{
|
||||
ephemeral_file::is_ephemeral_file, metadata::TimelineMetadata, Tenant, TenantState,
|
||||
};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX};
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
|
||||
use utils::crashsafe_dir::{self, path_with_suffix_extension};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -70,34 +70,54 @@ pub fn init_tenant_mgr(
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.expect("remote storage without config");
|
||||
|
||||
let mut broken_tenants = HashMap::new();
|
||||
let mut ready_tenants = HashMap::new();
|
||||
for (tenant_id, tenant_attach_data) in local_tenant_files.into_iter() {
|
||||
match tenant_attach_data {
|
||||
TenantAttachData::Ready(t) => {
|
||||
ready_tenants.insert(tenant_id, t);
|
||||
}
|
||||
TenantAttachData::Broken(e) => {
|
||||
broken_tenants.insert(tenant_id, TenantAttachData::Broken(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
let SyncStartupData {
|
||||
remote_index,
|
||||
local_timeline_init_statuses,
|
||||
} = storage_sync::spawn_storage_sync_task(
|
||||
conf,
|
||||
local_tenant_files,
|
||||
ready_tenants,
|
||||
storage,
|
||||
storage_config.max_concurrent_syncs,
|
||||
storage_config.max_sync_errors,
|
||||
)
|
||||
.context("Failed to spawn the storage sync thread")?;
|
||||
|
||||
(
|
||||
remote_index,
|
||||
local_timeline_init_statuses.filter_map(|init_status| match init_status {
|
||||
LocalTimelineInitStatus::LocallyComplete(metadata) => Some(metadata),
|
||||
LocalTimelineInitStatus::NeedsSync => None,
|
||||
}),
|
||||
)
|
||||
let n = local_timeline_init_statuses.0.len();
|
||||
let mut synced_timelines = local_timeline_init_statuses.0.into_iter().fold(
|
||||
HashMap::<TenantId, TenantAttachData>::with_capacity(n),
|
||||
|mut new_values, (tenant_id, old_values)| {
|
||||
let new_timeline_values = new_values
|
||||
.entry(tenant_id)
|
||||
.or_insert_with(|| TenantAttachData::Ready(HashMap::new()));
|
||||
if let TenantAttachData::Ready(t) = new_timeline_values {
|
||||
for (timeline_id, old_value) in old_values {
|
||||
if let LocalTimelineInitStatus::LocallyComplete(metadata) = old_value {
|
||||
t.insert(timeline_id, TimelineLocalFiles::ready(metadata));
|
||||
}
|
||||
}
|
||||
}
|
||||
new_values
|
||||
},
|
||||
);
|
||||
synced_timelines.extend(broken_tenants);
|
||||
|
||||
(remote_index, synced_timelines)
|
||||
} else {
|
||||
info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
|
||||
(
|
||||
RemoteIndex::default(),
|
||||
local_tenant_files.filter_map(|(metadata, _)| Some(metadata)),
|
||||
)
|
||||
(RemoteIndex::default(), local_tenant_files)
|
||||
};
|
||||
|
||||
attach_local_tenants(conf, &remote_index, tenants_to_attach);
|
||||
|
||||
Ok(remote_index)
|
||||
@@ -117,18 +137,12 @@ pub fn init_tenant_mgr(
|
||||
pub fn attach_local_tenants(
|
||||
conf: &'static PageServerConf,
|
||||
remote_index: &RemoteIndex,
|
||||
tenants_to_attach: TenantTimelineValues<TimelineMetadata>,
|
||||
tenants_to_attach: HashMap<TenantId, TenantAttachData>,
|
||||
) {
|
||||
let _entered = info_span!("attach_local_tenants").entered();
|
||||
let number_of_tenants = tenants_to_attach.0.len();
|
||||
|
||||
for (tenant_id, local_timelines) in tenants_to_attach.0 {
|
||||
info!(
|
||||
"Attaching {} timelines for {tenant_id}",
|
||||
local_timelines.len()
|
||||
);
|
||||
debug!("Timelines to attach: {local_timelines:?}");
|
||||
let number_of_tenants = tenants_to_attach.len();
|
||||
|
||||
for (tenant_id, local_timelines) in tenants_to_attach {
|
||||
let mut tenants_accessor = tenants_state::write_tenants();
|
||||
let tenant = match tenants_accessor.entry(tenant_id) {
|
||||
hash_map::Entry::Occupied(o) => {
|
||||
@@ -137,25 +151,55 @@ pub fn attach_local_tenants(
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
info!("Tenant {tenant_id} was not found in pageserver's memory, loading it");
|
||||
let tenant = load_local_tenant(conf, tenant_id, remote_index);
|
||||
let tenant = Arc::new(Tenant::new(
|
||||
conf,
|
||||
TenantConfOpt::default(),
|
||||
Arc::new(PostgresRedoManager::new(conf, tenant_id)),
|
||||
tenant_id,
|
||||
remote_index.clone(),
|
||||
conf.remote_storage_config.is_some(),
|
||||
));
|
||||
match local_timelines {
|
||||
TenantAttachData::Broken(_) => {
|
||||
tenant.set_state(TenantState::Broken);
|
||||
}
|
||||
TenantAttachData::Ready(_) => {
|
||||
match Tenant::load_tenant_config(conf, tenant_id) {
|
||||
Ok(tenant_conf) => {
|
||||
tenant.update_tenant_config(tenant_conf);
|
||||
tenant.activate(false);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}");
|
||||
tenant.set_state(TenantState::Broken);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
v.insert(Arc::clone(&tenant));
|
||||
tenant
|
||||
}
|
||||
};
|
||||
drop(tenants_accessor);
|
||||
|
||||
if tenant.current_state() == TenantState::Broken {
|
||||
warn!("Skipping timeline load for broken tenant {tenant_id}")
|
||||
} else {
|
||||
let has_timelines = !local_timelines.is_empty();
|
||||
match tenant.init_attach_timelines(local_timelines) {
|
||||
Ok(()) => {
|
||||
info!("successfully loaded local timelines for tenant {tenant_id}");
|
||||
tenant.activate(has_timelines);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to attach tenant timelines: {e:?}");
|
||||
tenant.set_state(TenantState::Broken);
|
||||
match local_timelines {
|
||||
TenantAttachData::Broken(e) => warn!("{}", e),
|
||||
TenantAttachData::Ready(ref timelines) => {
|
||||
info!("Attaching {} timelines for {tenant_id}", timelines.len());
|
||||
debug!("Timelines to attach: {local_timelines:?}");
|
||||
let has_timelines = !timelines.is_empty();
|
||||
let timelines_to_attach = timelines
|
||||
.iter()
|
||||
.map(|(&k, v)| (k, v.metadata().to_owned()))
|
||||
.collect();
|
||||
match tenant.init_attach_timelines(timelines_to_attach) {
|
||||
Ok(()) => {
|
||||
info!("successfully loaded local timelines for tenant {tenant_id}");
|
||||
tenant.activate(has_timelines);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to attach tenant timelines: {e:?}");
|
||||
tenant.set_state(TenantState::Broken);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -164,44 +208,6 @@ pub fn attach_local_tenants(
|
||||
info!("Processed {number_of_tenants} local tenants during attach")
|
||||
}
|
||||
|
||||
/// Builds an in-memory `Tenant` for `tenant_id` and sets its initial state from
/// what is found on local disk.
///
/// The tenant is created with `TenantConfOpt::default()`. It is marked
/// `TenantState::Broken` when its timelines directory is missing or when its
/// config cannot be loaded; otherwise the loaded config is applied and
/// `activate(false)` is called. The tenant is returned in every case.
fn load_local_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
remote_index: &RemoteIndex,
|
||||
) -> Arc<Tenant> {
|
||||
let tenant = Arc::new(Tenant::new(
|
||||
conf,
|
||||
TenantConfOpt::default(),
|
||||
Arc::new(PostgresRedoManager::new(conf, tenant_id)),
|
||||
tenant_id,
|
||||
remote_index.clone(),
|
||||
// Passes whether a remote storage config is present.
conf.remote_storage_config.is_some(),
|
||||
));
|
||||
|
||||
let tenant_timelines_dir = conf.timelines_path(&tenant_id);
|
||||
// A tenant without a timelines directory is logged and marked broken.
if !tenant_timelines_dir.is_dir() {
|
||||
error!(
|
||||
"Tenant {} has no timelines directory at {}",
|
||||
tenant_id,
|
||||
tenant_timelines_dir.display()
|
||||
);
|
||||
tenant.set_state(TenantState::Broken);
|
||||
} else {
|
||||
match Tenant::load_tenant_config(conf, tenant_id) {
|
||||
Ok(tenant_conf) => {
|
||||
tenant.update_tenant_config(tenant_conf);
|
||||
tenant.activate(false);
|
||||
}
|
||||
// A config read failure is logged and the tenant is marked broken
// rather than returning an error to the caller.
Err(e) => {
|
||||
error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}");
|
||||
tenant.set_state(TenantState::Broken);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tenant
|
||||
}
|
||||
|
||||
///
|
||||
/// Shut down all tenants. This runs as part of pageserver shutdown.
|
||||
///
|
||||
@@ -475,16 +481,21 @@ pub fn list_tenant_info(remote_index: &RemoteTimelineIndex) -> Vec<TenantInfo> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Result of scanning one tenant's local directory; the per-tenant value of the
/// map that is later handed to `attach_local_tenants`.
#[derive(Debug)]
|
||||
pub enum TenantAttachData {
|
||||
/// The tenant's timelines were collected: maps each timeline to its local
/// files. The map may be empty.
Ready(HashMap<TimelineId, TimelineLocalFiles>),
|
||||
/// The tenant could not be collected (e.g. it has no timelines directory);
/// carries the reason. A tenant newly loaded from this variant is put into
/// `TenantState::Broken`.
Broken(anyhow::Error),
|
||||
}
|
||||
/// Attempts to collect information about all tenants and timelines existing on the local FS.
|
||||
/// If it finds any, deletes all temporary files and directories created earlier. Also removes empty directories
|
||||
/// that may appear due to such removals.
|
||||
/// Does not fail on individual timeline or tenant collection errors; instead logs them and ignores the affected entities.
|
||||
fn local_tenant_timeline_files(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<TenantTimelineValues<(TimelineMetadata, HashSet<PathBuf>)>> {
|
||||
) -> anyhow::Result<HashMap<TenantId, TenantAttachData>> {
|
||||
let _entered = info_span!("local_tenant_timeline_files").entered();
|
||||
|
||||
let mut local_tenant_timeline_files = TenantTimelineValues::new();
|
||||
let mut local_tenant_timeline_files = HashMap::new();
|
||||
let tenants_dir = config.tenants_path();
|
||||
for tenants_dir_entry in fs::read_dir(&tenants_dir)
|
||||
.with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
|
||||
@@ -506,19 +517,31 @@ fn local_tenant_timeline_files(
|
||||
}
|
||||
} else {
|
||||
match collect_timelines_for_tenant(config, &tenant_dir_path) {
|
||||
Ok((tenant_id, collected_files)) => {
|
||||
Ok((tenant_id, TenantAttachData::Broken(e))) => {
|
||||
local_tenant_timeline_files.entry(tenant_id).or_insert(TenantAttachData::Broken(e));
|
||||
},
|
||||
Ok((tenant_id, TenantAttachData::Ready(collected_files))) => {
|
||||
if collected_files.is_empty() {
|
||||
match remove_if_empty(&tenant_dir_path) {
|
||||
Ok(true) => info!("Removed empty tenant directory {}", tenant_dir_path.display()),
|
||||
Ok(false) => {
|
||||
// insert empty timeline entry: it has some non-temporary files inside that we cannot remove
|
||||
// so make it obvious to HTTP API callers that something exists there, and try to load the tenant
|
||||
let _ = local_tenant_timeline_files.0.entry(tenant_id).or_default();
|
||||
let _ = local_tenant_timeline_files.entry(tenant_id).or_insert_with(|| TenantAttachData::Ready(HashMap::new()));
|
||||
},
|
||||
Err(e) => error!("Failed to remove empty tenant directory: {e:?}"),
|
||||
}
|
||||
} else {
|
||||
local_tenant_timeline_files.0.entry(tenant_id).or_default().extend(collected_files.into_iter())
|
||||
match local_tenant_timeline_files.entry(tenant_id) {
|
||||
hash_map::Entry::Vacant(entry) => {
|
||||
entry.insert(TenantAttachData::Ready(collected_files));
|
||||
}
|
||||
hash_map::Entry::Occupied(entry) =>{
|
||||
if let TenantAttachData::Ready(old_timelines) = entry.into_mut() {
|
||||
old_timelines.extend(collected_files);
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
},
|
||||
Err(e) => error!(
|
||||
@@ -541,7 +564,7 @@ fn local_tenant_timeline_files(
|
||||
|
||||
info!(
|
||||
"Collected files for {} tenants",
|
||||
local_tenant_timeline_files.0.len()
|
||||
local_tenant_timeline_files.len(),
|
||||
);
|
||||
Ok(local_tenant_timeline_files)
|
||||
}
|
||||
@@ -579,14 +602,10 @@ fn is_temporary(path: &Path) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn collect_timelines_for_tenant(
|
||||
config: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
) -> anyhow::Result<(
|
||||
TenantId,
|
||||
HashMap<TimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
|
||||
)> {
|
||||
) -> anyhow::Result<(TenantId, TenantAttachData)> {
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
@@ -595,6 +614,17 @@ fn collect_timelines_for_tenant(
|
||||
.context("Could not parse tenant id out of the tenant dir name")?;
|
||||
let timelines_dir = config.timelines_path(&tenant_id);
|
||||
|
||||
if !timelines_dir.as_path().is_dir() {
|
||||
return Ok((
|
||||
tenant_id,
|
||||
TenantAttachData::Broken(anyhow::anyhow!(
|
||||
"Tenant {} has no timelines directory at {}",
|
||||
tenant_id,
|
||||
timelines_dir.display()
|
||||
)),
|
||||
));
|
||||
}
|
||||
|
||||
let mut tenant_timelines = HashMap::new();
|
||||
for timelines_dir_entry in fs::read_dir(&timelines_dir)
|
||||
.with_context(|| format!("Failed to list timelines dir entry for tenant {tenant_id}"))?
|
||||
@@ -617,7 +647,10 @@ fn collect_timelines_for_tenant(
|
||||
} else {
|
||||
match collect_timeline_files(&timeline_dir) {
|
||||
Ok((timeline_id, metadata, timeline_files)) => {
|
||||
tenant_timelines.insert(timeline_id, (metadata, timeline_files));
|
||||
tenant_timelines.insert(
|
||||
timeline_id,
|
||||
TimelineLocalFiles::collected(metadata, timeline_files),
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
@@ -652,15 +685,19 @@ fn collect_timelines_for_tenant(
|
||||
debug!("Tenant {tenant_id} has no timelines loaded");
|
||||
}
|
||||
|
||||
Ok((tenant_id, tenant_timelines))
|
||||
Ok((tenant_id, TenantAttachData::Ready(tenant_timelines)))
|
||||
}
|
||||
|
||||
// discover timeline files and extract timeline metadata
|
||||
// NOTE: ephemeral files are excluded from the list
|
||||
fn collect_timeline_files(
|
||||
timeline_dir: &Path,
|
||||
) -> anyhow::Result<(TimelineId, TimelineMetadata, HashSet<PathBuf>)> {
|
||||
let mut timeline_files = HashSet::new();
|
||||
) -> anyhow::Result<(
|
||||
TimelineId,
|
||||
TimelineMetadata,
|
||||
HashMap<PathBuf, LayerFileMetadata>,
|
||||
)> {
|
||||
let mut timeline_files = HashMap::new();
|
||||
let mut timeline_metadata_path = None;
|
||||
|
||||
let timeline_id = timeline_dir
|
||||
@@ -673,7 +710,9 @@ fn collect_timeline_files(
|
||||
fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
|
||||
for entry in timeline_dir_entries {
|
||||
let entry_path = entry.context("Failed to list timeline dir entry")?.path();
|
||||
if entry_path.is_file() {
|
||||
let metadata = entry_path.metadata()?;
|
||||
|
||||
if metadata.is_file() {
|
||||
if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) {
|
||||
timeline_metadata_path = Some(entry_path);
|
||||
} else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) {
|
||||
@@ -688,7 +727,8 @@ fn collect_timeline_files(
|
||||
)
|
||||
})?;
|
||||
} else {
|
||||
timeline_files.insert(entry_path);
|
||||
let layer_metadata = LayerFileMetadata::new(metadata.len());
|
||||
timeline_files.insert(entry_path, layer_metadata);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,8 +70,10 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
// Run compaction
|
||||
let mut sleep_duration = tenant.get_compaction_period();
|
||||
if let Err(e) = tenant.compaction_iteration() {
|
||||
error!("Compaction failed, retrying: {e:#}");
|
||||
sleep_duration = wait_duration;
|
||||
error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration);
|
||||
#[cfg(feature = "testing")]
|
||||
std::process::abort();
|
||||
}
|
||||
|
||||
// Sleep
|
||||
@@ -119,8 +121,10 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
if gc_horizon > 0 {
|
||||
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false)
|
||||
{
|
||||
error!("Gc failed, retrying: {e:#}");
|
||||
sleep_duration = wait_duration;
|
||||
error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
|
||||
#[cfg(feature = "testing")]
|
||||
std::process::abort();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,8 @@ use utils::crashsafe_dir::path_with_suffix_extension;
|
||||
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
|
||||
|
||||
use crate::metrics::{
|
||||
WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, WAL_REDO_WAIT_TIME,
|
||||
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
|
||||
WAL_REDO_WAIT_TIME,
|
||||
};
|
||||
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
@@ -244,12 +245,23 @@ impl PostgresRedoManager {
|
||||
let end_time = Instant::now();
|
||||
let duration = end_time.duration_since(lock_time);
|
||||
|
||||
let len = records.len();
|
||||
let nbytes = records.iter().fold(0, |acumulator, record| {
|
||||
acumulator
|
||||
+ match &record.1 {
|
||||
NeonWalRecord::Postgres { rec, .. } => rec.len(),
|
||||
_ => unreachable!("Only PostgreSQL records are accepted in this batch"),
|
||||
}
|
||||
});
|
||||
|
||||
WAL_REDO_TIME.observe(duration.as_secs_f64());
|
||||
WAL_REDO_RECORDS_HISTOGRAM.observe(records.len() as f64);
|
||||
WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
|
||||
WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
|
||||
|
||||
debug!(
|
||||
"postgres applied {} WAL records in {} us to reconstruct page image at LSN {}",
|
||||
records.len(),
|
||||
"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
|
||||
len,
|
||||
nbytes,
|
||||
duration.as_micros(),
|
||||
lsn
|
||||
);
|
||||
@@ -258,8 +270,9 @@ impl PostgresRedoManager {
|
||||
// next request will launch a new one.
|
||||
if result.is_err() {
|
||||
error!(
|
||||
"error applying {} WAL records to reconstruct page image at LSN {}",
|
||||
"error applying {} WAL records ({} bytes) to reconstruct page image at LSN {}",
|
||||
records.len(),
|
||||
nbytes,
|
||||
lsn
|
||||
);
|
||||
let process = process_guard.take().unwrap();
|
||||
|
||||
@@ -10,51 +10,12 @@ struct WalProposerConn
|
||||
PGconn *pg_conn;
|
||||
bool is_nonblocking; /* whether the connection is non-blocking */
|
||||
char *recvbuf; /* last received data from
|
||||
* libpqprop_async_read */
|
||||
* walprop_async_read */
|
||||
};
|
||||
|
||||
/* Prototypes for exported functions */
|
||||
static char *libpqprop_error_message(WalProposerConn * conn);
|
||||
static WalProposerConnStatusType libpqprop_status(WalProposerConn * conn);
|
||||
static WalProposerConn * libpqprop_connect_start(char *conninfo);
|
||||
static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn * conn);
|
||||
static bool libpqprop_send_query(WalProposerConn * conn, char *query);
|
||||
static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn * conn);
|
||||
static pgsocket libpqprop_socket(WalProposerConn * conn);
|
||||
static int libpqprop_flush(WalProposerConn * conn);
|
||||
static void libpqprop_finish(WalProposerConn * conn);
|
||||
static PGAsyncReadResult libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount);
|
||||
static PGAsyncWriteResult libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size);
|
||||
static bool libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size);
|
||||
|
||||
/*
 * Function table listing the libpqprop_* implementations in this file.
 * pg_init_libpqwalproposer() publishes it through WalProposerFunctions.
 */
static WalProposerFunctionsType PQWalProposerFunctions =
|
||||
{
|
||||
libpqprop_error_message,
|
||||
libpqprop_status,
|
||||
libpqprop_connect_start,
|
||||
libpqprop_connect_poll,
|
||||
libpqprop_send_query,
|
||||
libpqprop_get_query_result,
|
||||
libpqprop_socket,
|
||||
libpqprop_flush,
|
||||
libpqprop_finish,
|
||||
libpqprop_async_read,
|
||||
libpqprop_async_write,
|
||||
libpqprop_blocking_write,
|
||||
};
|
||||
|
||||
/*
 * Module initialization: point WalProposerFunctions at this file's function
 * table. Raises ERROR if WalProposerFunctions has already been set.
 */
|
||||
void
|
||||
pg_init_libpqwalproposer(void)
|
||||
{
|
||||
if (WalProposerFunctions != NULL)
|
||||
elog(ERROR, "libpqwalproposer already loaded");
|
||||
WalProposerFunctions = &PQWalProposerFunctions;
|
||||
}
|
||||
|
||||
/* Helper function */
|
||||
static bool
|
||||
ensure_nonblocking_status(WalProposerConn * conn, bool is_nonblocking)
|
||||
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
|
||||
{
|
||||
/* If we're already correctly blocking or nonblocking, all good */
|
||||
if (is_nonblocking == conn->is_nonblocking)
|
||||
@@ -69,14 +30,14 @@ ensure_nonblocking_status(WalProposerConn * conn, bool is_nonblocking)
|
||||
}
|
||||
|
||||
/* Exported function definitions */
|
||||
static char *
|
||||
libpqprop_error_message(WalProposerConn * conn)
|
||||
char *
|
||||
walprop_error_message(WalProposerConn *conn)
|
||||
{
|
||||
return PQerrorMessage(conn->pg_conn);
|
||||
}
|
||||
|
||||
static WalProposerConnStatusType
|
||||
libpqprop_status(WalProposerConn * conn)
|
||||
WalProposerConnStatusType
|
||||
walprop_status(WalProposerConn *conn)
|
||||
{
|
||||
switch (PQstatus(conn->pg_conn))
|
||||
{
|
||||
@@ -89,8 +50,8 @@ libpqprop_status(WalProposerConn * conn)
|
||||
}
|
||||
}
|
||||
|
||||
static WalProposerConn *
|
||||
libpqprop_connect_start(char *conninfo)
|
||||
WalProposerConn *
|
||||
walprop_connect_start(char *conninfo)
|
||||
{
|
||||
WalProposerConn *conn;
|
||||
PGconn *pg_conn;
|
||||
@@ -119,8 +80,8 @@ libpqprop_connect_start(char *conninfo)
|
||||
return conn;
|
||||
}
|
||||
|
||||
static WalProposerConnectPollStatusType
|
||||
libpqprop_connect_poll(WalProposerConn * conn)
|
||||
WalProposerConnectPollStatusType
|
||||
walprop_connect_poll(WalProposerConn *conn)
|
||||
{
|
||||
WalProposerConnectPollStatusType return_val;
|
||||
|
||||
@@ -160,8 +121,8 @@ libpqprop_connect_poll(WalProposerConn * conn)
|
||||
return return_val;
|
||||
}
|
||||
|
||||
static bool
|
||||
libpqprop_send_query(WalProposerConn * conn, char *query)
|
||||
bool
|
||||
walprop_send_query(WalProposerConn *conn, char *query)
|
||||
{
|
||||
/*
|
||||
* We need to be in blocking mode for sending the query to run without
|
||||
@@ -177,8 +138,8 @@ libpqprop_send_query(WalProposerConn * conn, char *query)
|
||||
return true;
|
||||
}
|
||||
|
||||
static WalProposerExecStatusType
|
||||
libpqprop_get_query_result(WalProposerConn * conn)
|
||||
WalProposerExecStatusType
|
||||
walprop_get_query_result(WalProposerConn *conn)
|
||||
{
|
||||
PGresult *result;
|
||||
WalProposerExecStatusType return_val;
|
||||
@@ -255,20 +216,20 @@ libpqprop_get_query_result(WalProposerConn * conn)
|
||||
return return_val;
|
||||
}
|
||||
|
||||
static pgsocket
|
||||
libpqprop_socket(WalProposerConn * conn)
|
||||
pgsocket
|
||||
walprop_socket(WalProposerConn *conn)
|
||||
{
|
||||
return PQsocket(conn->pg_conn);
|
||||
}
|
||||
|
||||
static int
|
||||
libpqprop_flush(WalProposerConn * conn)
|
||||
int
|
||||
walprop_flush(WalProposerConn *conn)
|
||||
{
|
||||
return (PQflush(conn->pg_conn));
|
||||
}
|
||||
|
||||
static void
|
||||
libpqprop_finish(WalProposerConn * conn)
|
||||
void
|
||||
walprop_finish(WalProposerConn *conn)
|
||||
{
|
||||
if (conn->recvbuf != NULL)
|
||||
PQfreemem(conn->recvbuf);
|
||||
@@ -282,8 +243,8 @@ libpqprop_finish(WalProposerConn * conn)
|
||||
* On success, the data is placed in *buf. It is valid until the next call
|
||||
* to this function.
|
||||
*/
|
||||
static PGAsyncReadResult
|
||||
libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount)
|
||||
PGAsyncReadResult
|
||||
walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
{
|
||||
int result;
|
||||
|
||||
@@ -353,8 +314,8 @@ libpqprop_async_read(WalProposerConn * conn, char **buf, int *amount)
|
||||
}
|
||||
}
|
||||
|
||||
static PGAsyncWriteResult
|
||||
libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size)
|
||||
PGAsyncWriteResult
|
||||
walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
|
||||
{
|
||||
int result;
|
||||
|
||||
@@ -408,8 +369,12 @@ libpqprop_async_write(WalProposerConn * conn, void const *buf, size_t size)
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size)
|
||||
/*
|
||||
* This function is very similar to walprop_async_write. For more
|
||||
* information, refer to the comments there.
|
||||
*/
|
||||
bool
|
||||
walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
|
||||
{
|
||||
int result;
|
||||
|
||||
@@ -417,10 +382,6 @@ libpqprop_blocking_write(WalProposerConn * conn, void const *buf, size_t size)
|
||||
if (!ensure_nonblocking_status(conn, false))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* This function is very similar to libpqprop_async_write. For more
|
||||
* information, refer to the comments there
|
||||
*/
|
||||
if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
|
||||
return false;
|
||||
|
||||
|
||||
@@ -32,7 +32,6 @@ void
|
||||
_PG_init(void)
|
||||
{
|
||||
pg_init_libpagestore();
|
||||
pg_init_libpqwalproposer();
|
||||
pg_init_walproposer();
|
||||
|
||||
EmitWarningsOnPlaceholders("neon");
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
#define NEON_H
|
||||
|
||||
extern void pg_init_libpagestore(void);
|
||||
extern void pg_init_libpqwalproposer(void);
|
||||
extern void pg_init_walproposer(void);
|
||||
|
||||
#endif /* NEON_H */
|
||||
|
||||
@@ -79,9 +79,6 @@ bool am_wal_proposer;
|
||||
char *neon_timeline_walproposer = NULL;
|
||||
char *neon_tenant_walproposer = NULL;
|
||||
|
||||
/* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */
|
||||
WalProposerFunctionsType *WalProposerFunctions = NULL;
|
||||
|
||||
#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
|
||||
|
||||
static int n_safekeepers = 0;
|
||||
@@ -438,10 +435,6 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId)
|
||||
char *sep;
|
||||
char *port;
|
||||
|
||||
/* Load the libpq-specific functions */
|
||||
if (WalProposerFunctions == NULL)
|
||||
elog(ERROR, "libpqwalproposer didn't initialize correctly");
|
||||
|
||||
load_file("libpqwalreceiver", false);
|
||||
if (WalReceiverFunctions == NULL)
|
||||
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
|
||||
@@ -1471,12 +1464,6 @@ SendProposerElected(Safekeeper *sk)
|
||||
*/
|
||||
th = &sk->voteResponse.termHistory;
|
||||
|
||||
/*
|
||||
* If any WAL is present on the sk, it must be authorized by some term.
|
||||
* OTOH, without any WAL there are no term switches in the log.
|
||||
*/
|
||||
Assert((th->n_entries == 0) ==
|
||||
(sk->voteResponse.flushLsn == InvalidXLogRecPtr));
|
||||
/* We must start somewhere. */
|
||||
Assert(propTermHistory.n_entries >= 1);
|
||||
|
||||
|
||||
@@ -446,31 +446,31 @@ typedef enum
|
||||
} WalProposerConnStatusType;
|
||||
|
||||
/* Re-exported PQerrorMessage */
|
||||
typedef char *(*walprop_error_message_fn) (WalProposerConn * conn);
|
||||
extern char *walprop_error_message(WalProposerConn *conn);
|
||||
|
||||
/* Re-exported PQstatus */
|
||||
typedef WalProposerConnStatusType(*walprop_status_fn) (WalProposerConn * conn);
|
||||
extern WalProposerConnStatusType walprop_status(WalProposerConn *conn);
|
||||
|
||||
/* Re-exported PQconnectStart */
|
||||
typedef WalProposerConn * (*walprop_connect_start_fn) (char *conninfo);
|
||||
extern WalProposerConn * walprop_connect_start(char *conninfo);
|
||||
|
||||
/* Re-exported PQconnectPoll */
|
||||
typedef WalProposerConnectPollStatusType(*walprop_connect_poll_fn) (WalProposerConn * conn);
|
||||
extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn);
|
||||
|
||||
/* Blocking wrapper around PQsendQuery */
|
||||
typedef bool (*walprop_send_query_fn) (WalProposerConn * conn, char *query);
|
||||
extern bool walprop_send_query(WalProposerConn *conn, char *query);
|
||||
|
||||
/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */
|
||||
typedef WalProposerExecStatusType(*walprop_get_query_result_fn) (WalProposerConn * conn);
|
||||
extern WalProposerExecStatusType walprop_get_query_result(WalProposerConn *conn);
|
||||
|
||||
/* Re-exported PQsocket */
|
||||
typedef pgsocket (*walprop_socket_fn) (WalProposerConn * conn);
|
||||
extern pgsocket walprop_socket(WalProposerConn *conn);
|
||||
|
||||
/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */
|
||||
typedef int (*walprop_flush_fn) (WalProposerConn * conn);
|
||||
extern int walprop_flush(WalProposerConn *conn);
|
||||
|
||||
/* Re-exported PQfinish */
|
||||
typedef void (*walprop_finish_fn) (WalProposerConn * conn);
|
||||
extern void walprop_finish(WalProposerConn *conn);
|
||||
|
||||
/*
|
||||
* Ergonomic wrapper around PQgetCopyData
|
||||
@@ -486,9 +486,7 @@ typedef void (*walprop_finish_fn) (WalProposerConn * conn);
|
||||
* performs a bit of extra checking work that's always required and is normally
|
||||
* somewhat verbose.
|
||||
*/
|
||||
typedef PGAsyncReadResult(*walprop_async_read_fn) (WalProposerConn * conn,
|
||||
char **buf,
|
||||
int *amount);
|
||||
extern PGAsyncReadResult walprop_async_read(WalProposerConn *conn, char **buf, int *amount);
|
||||
|
||||
/*
|
||||
* Ergonomic wrapper around PQputCopyData + PQflush
|
||||
@@ -497,69 +495,14 @@ typedef PGAsyncReadResult(*walprop_async_read_fn) (WalProposerConn * conn,
|
||||
*
|
||||
* For information on the meaning of return codes, refer to PGAsyncWriteResult.
|
||||
*/
|
||||
typedef PGAsyncWriteResult(*walprop_async_write_fn) (WalProposerConn * conn,
|
||||
void const *buf,
|
||||
size_t size);
|
||||
extern PGAsyncWriteResult walprop_async_write(WalProposerConn *conn, void const *buf, size_t size);
|
||||
|
||||
/*
|
||||
* Blocking equivalent to walprop_async_write_fn
|
||||
*
|
||||
* Returns 'true' if successful, 'false' on failure.
|
||||
*/
|
||||
typedef bool (*walprop_blocking_write_fn) (WalProposerConn * conn, void const *buf, size_t size);
|
||||
|
||||
/* All libpqwalproposer exported functions collected together. */
|
||||
typedef struct WalProposerFunctionsType
|
||||
{
|
||||
walprop_error_message_fn walprop_error_message;
|
||||
walprop_status_fn walprop_status;
|
||||
walprop_connect_start_fn walprop_connect_start;
|
||||
walprop_connect_poll_fn walprop_connect_poll;
|
||||
walprop_send_query_fn walprop_send_query;
|
||||
walprop_get_query_result_fn walprop_get_query_result;
|
||||
walprop_socket_fn walprop_socket;
|
||||
walprop_flush_fn walprop_flush;
|
||||
walprop_finish_fn walprop_finish;
|
||||
walprop_async_read_fn walprop_async_read;
|
||||
walprop_async_write_fn walprop_async_write;
|
||||
walprop_blocking_write_fn walprop_blocking_write;
|
||||
} WalProposerFunctionsType;
|
||||
|
||||
/* Allow the above functions to be "called" with normal syntax */
|
||||
#define walprop_error_message(conn) \
|
||||
WalProposerFunctions->walprop_error_message(conn)
|
||||
#define walprop_status(conn) \
|
||||
WalProposerFunctions->walprop_status(conn)
|
||||
#define walprop_connect_start(conninfo) \
|
||||
WalProposerFunctions->walprop_connect_start(conninfo)
|
||||
#define walprop_connect_poll(conn) \
|
||||
WalProposerFunctions->walprop_connect_poll(conn)
|
||||
#define walprop_send_query(conn, query) \
|
||||
WalProposerFunctions->walprop_send_query(conn, query)
|
||||
#define walprop_get_query_result(conn) \
|
||||
WalProposerFunctions->walprop_get_query_result(conn)
|
||||
#define walprop_set_nonblocking(conn, arg) \
|
||||
WalProposerFunctions->walprop_set_nonblocking(conn, arg)
|
||||
#define walprop_socket(conn) \
|
||||
WalProposerFunctions->walprop_socket(conn)
|
||||
#define walprop_flush(conn) \
|
||||
WalProposerFunctions->walprop_flush(conn)
|
||||
#define walprop_finish(conn) \
|
||||
WalProposerFunctions->walprop_finish(conn)
|
||||
#define walprop_async_read(conn, buf, amount) \
|
||||
WalProposerFunctions->walprop_async_read(conn, buf, amount)
|
||||
#define walprop_async_write(conn, buf, size) \
|
||||
WalProposerFunctions->walprop_async_write(conn, buf, size)
|
||||
#define walprop_blocking_write(conn, buf, size) \
|
||||
WalProposerFunctions->walprop_blocking_write(conn, buf, size)
|
||||
|
||||
/*
|
||||
* The runtime location of the libpqwalproposer functions.
|
||||
*
|
||||
* This pointer is set by the initializer in libpqwalproposer, so that we
|
||||
* can use it later.
|
||||
*/
|
||||
extern PGDLLIMPORT WalProposerFunctionsType * WalProposerFunctions;
|
||||
extern bool walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size);
|
||||
|
||||
extern uint64 BackpressureThrottlingTime(void);
|
||||
|
||||
|
||||
@@ -5,11 +5,11 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
async-trait = "0.1"
|
||||
atty = "0.2.14"
|
||||
base64 = "0.13.0"
|
||||
bstr = "0.2.17"
|
||||
bstr = "1.0"
|
||||
bytes = { version = "1.0.1", features = ['serde'] }
|
||||
clap = "3.0"
|
||||
clap = "4.0"
|
||||
futures = "0.3.13"
|
||||
git-version = "0.3.5"
|
||||
hashbrown = "0.12"
|
||||
@@ -22,7 +22,11 @@ once_cell = "1.13.0"
|
||||
parking_lot = "0.12"
|
||||
pin-project-lite = "0.2.7"
|
||||
rand = "0.8.3"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
reqwest = { version = "0.11", default-features = false, features = [
|
||||
"blocking",
|
||||
"json",
|
||||
"rustls-tls",
|
||||
] }
|
||||
routerify = "3"
|
||||
rustls = "0.20.0"
|
||||
rustls-pemfile = "1"
|
||||
@@ -33,17 +37,20 @@ sha2 = "0.10.2"
|
||||
socket2 = "0.4.4"
|
||||
thiserror = "1.0.30"
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-rustls = "0.23.0"
|
||||
tracing = "0.1.36"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
url = "2.2.2"
|
||||
uuid = { version = "0.8.2", features = ["v4", "serde"]}
|
||||
x509-parser = "0.13.2"
|
||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||
x509-parser = "0.14"
|
||||
|
||||
utils = { path = "../libs/utils" }
|
||||
metrics = { path = "../libs/metrics" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
rcgen = "0.8.14"
|
||||
rstest = "0.12"
|
||||
async-trait = "0.1"
|
||||
rcgen = "0.10"
|
||||
rstest = "0.15"
|
||||
tokio-postgres-rustls = "0.9.0"
|
||||
|
||||
@@ -15,6 +15,7 @@ use once_cell::sync::Lazy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::borrow::Cow;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
|
||||
static CPLANE_WAITERS: Lazy<Waiters<mgmt::ComputeReady>> = Lazy::new(Default::default);
|
||||
|
||||
@@ -171,6 +172,8 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
// support SNI or other means of passing the project name.
|
||||
// We now expect to see a very specific payload in the place of password.
|
||||
if creds.project().is_none() {
|
||||
warn!("project name not specified, resorting to the password hack auth flow");
|
||||
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::PasswordHack)
|
||||
.await?
|
||||
@@ -179,6 +182,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
|
||||
// Finally we may finish the initialization of `creds`.
|
||||
// TODO: add missing type safety to ClientCredentials.
|
||||
info!(project = &payload.project, "received missing parameter");
|
||||
creds.project = Some(payload.project.into());
|
||||
|
||||
let mut config = match &self {
|
||||
@@ -196,6 +200,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
// We should use a password from payload as well.
|
||||
config.password(payload.password);
|
||||
|
||||
info!("user successfully authenticated (using the password hack)");
|
||||
return Ok(compute::NodeInfo {
|
||||
reported_auth_ok: false,
|
||||
config,
|
||||
@@ -203,19 +208,31 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
}
|
||||
}
|
||||
|
||||
match self {
|
||||
let res = match self {
|
||||
Console(endpoint, creds) => {
|
||||
info!(
|
||||
user = creds.user,
|
||||
project = creds.project(),
|
||||
"performing authentication using the console"
|
||||
);
|
||||
console::Api::new(&endpoint, extra, &creds)
|
||||
.handle_user(client)
|
||||
.await
|
||||
}
|
||||
Postgres(endpoint, creds) => {
|
||||
info!("performing mock authentication using a local postgres instance");
|
||||
postgres::Api::new(&endpoint, &creds)
|
||||
.handle_user(client)
|
||||
.await
|
||||
}
|
||||
// NOTE: this auth backend doesn't use client credentials.
|
||||
Link(url) => link::handle_user(&url, client).await,
|
||||
}
|
||||
Link(url) => {
|
||||
info!("performing link authentication");
|
||||
link::handle_user(&url, client).await
|
||||
}
|
||||
}?;
|
||||
|
||||
info!("user successfully authenticated");
|
||||
Ok(res)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,35 +8,20 @@ use crate::{
|
||||
http, scram,
|
||||
stream::PqStream,
|
||||
};
|
||||
use futures::TryFutureExt;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::future::Future;
|
||||
use thiserror::Error;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{error, info, info_span};
|
||||
|
||||
const REQUEST_FAILED: &str = "Console request failed";
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum TransportError {
|
||||
#[error("Console responded with a malformed JSON: {0}")]
|
||||
BadResponse(#[from] serde_json::Error),
|
||||
#[error("{}", REQUEST_FAILED)]
|
||||
pub struct TransportError(#[from] std::io::Error);
|
||||
|
||||
/// HTTP status (other than 200) returned by the console.
|
||||
#[error("Console responded with an HTTP status: {0}")]
|
||||
HttpStatus(reqwest::StatusCode),
|
||||
|
||||
#[error(transparent)]
|
||||
Io(#[from] std::io::Error),
|
||||
}
|
||||
|
||||
impl UserFacingError for TransportError {
|
||||
fn to_string_client(&self) -> String {
|
||||
use TransportError::*;
|
||||
match self {
|
||||
HttpStatus(_) => self.to_string(),
|
||||
_ => REQUEST_FAILED.to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl UserFacingError for TransportError {}
|
||||
|
||||
// Helps eliminate graceless `.map_err` calls without introducing another ctor.
|
||||
impl From<reqwest::Error> for TransportError {
|
||||
@@ -148,10 +133,11 @@ impl<'a> Api<'a> {
|
||||
}
|
||||
|
||||
async fn get_auth_info(&self) -> Result<AuthInfo, GetAuthInfoError> {
|
||||
let request_id = uuid::Uuid::new_v4().to_string();
|
||||
let req = self
|
||||
.endpoint
|
||||
.get("proxy_get_role_secret")
|
||||
.header("X-Request-ID", uuid::Uuid::new_v4().to_string())
|
||||
.header("X-Request-ID", &request_id)
|
||||
.query(&[("session_id", self.extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", self.extra.application_name),
|
||||
@@ -160,27 +146,30 @@ impl<'a> Api<'a> {
|
||||
])
|
||||
.build()?;
|
||||
|
||||
// TODO: use a proper logger
|
||||
println!("cplane request: {}", req.url());
|
||||
let span = info_span!("http", id = request_id, url = req.url().as_str());
|
||||
info!(parent: &span, "request auth info");
|
||||
let msg = self
|
||||
.endpoint
|
||||
.checked_execute(req)
|
||||
.and_then(|r| r.json::<GetRoleSecretResponse>())
|
||||
.await
|
||||
.map_err(|e| {
|
||||
error!(parent: &span, "{e}");
|
||||
e
|
||||
})?;
|
||||
|
||||
let resp = self.endpoint.execute(req).await?;
|
||||
if !resp.status().is_success() {
|
||||
return Err(TransportError::HttpStatus(resp.status()).into());
|
||||
}
|
||||
|
||||
let response: GetRoleSecretResponse = serde_json::from_str(&resp.text().await?)?;
|
||||
|
||||
scram::ServerSecret::parse(&response.role_secret)
|
||||
scram::ServerSecret::parse(&msg.role_secret)
|
||||
.map(AuthInfo::Scram)
|
||||
.ok_or(GetAuthInfoError::BadSecret)
|
||||
}
|
||||
|
||||
/// Wake up the compute node and return the corresponding connection info.
|
||||
pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg, WakeComputeError> {
|
||||
let request_id = uuid::Uuid::new_v4().to_string();
|
||||
let req = self
|
||||
.endpoint
|
||||
.get("proxy_wake_compute")
|
||||
.header("X-Request-ID", uuid::Uuid::new_v4().to_string())
|
||||
.header("X-Request-ID", &request_id)
|
||||
.query(&[("session_id", self.extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", self.extra.application_name),
|
||||
@@ -188,19 +177,21 @@ impl<'a> Api<'a> {
|
||||
])
|
||||
.build()?;
|
||||
|
||||
// TODO: use a proper logger
|
||||
println!("cplane request: {}", req.url());
|
||||
|
||||
let resp = self.endpoint.execute(req).await?;
|
||||
if !resp.status().is_success() {
|
||||
return Err(TransportError::HttpStatus(resp.status()).into());
|
||||
}
|
||||
|
||||
let response: GetWakeComputeResponse = serde_json::from_str(&resp.text().await?)?;
|
||||
let span = info_span!("http", id = request_id, url = req.url().as_str());
|
||||
info!(parent: &span, "request wake-up");
|
||||
let msg = self
|
||||
.endpoint
|
||||
.checked_execute(req)
|
||||
.and_then(|r| r.json::<GetWakeComputeResponse>())
|
||||
.await
|
||||
.map_err(|e| {
|
||||
error!(parent: &span, "{e}");
|
||||
e
|
||||
})?;
|
||||
|
||||
// Unfortunately, ownership won't let us use `Option::ok_or` here.
|
||||
let (host, port) = match parse_host_port(&response.address) {
|
||||
None => return Err(WakeComputeError::BadComputeAddress(response.address)),
|
||||
let (host, port) = match parse_host_port(&msg.address) {
|
||||
None => return Err(WakeComputeError::BadComputeAddress(msg.address)),
|
||||
Some(x) => x,
|
||||
};
|
||||
|
||||
@@ -227,15 +218,18 @@ where
|
||||
GetAuthInfo: Future<Output = Result<AuthInfo, GetAuthInfoError>>,
|
||||
WakeCompute: Future<Output = Result<ComputeConnCfg, WakeComputeError>>,
|
||||
{
|
||||
info!("fetching user's authentication info");
|
||||
let auth_info = get_auth_info(endpoint).await?;
|
||||
|
||||
let flow = AuthFlow::new(client);
|
||||
let scram_keys = match auth_info {
|
||||
AuthInfo::Md5(_) => {
|
||||
// TODO: decide if we should support MD5 in api v2
|
||||
info!("auth endpoint chooses MD5");
|
||||
return Err(auth::AuthError::bad_auth_method("MD5"));
|
||||
}
|
||||
AuthInfo::Scram(secret) => {
|
||||
info!("auth endpoint chooses SCRAM");
|
||||
let scram = auth::Scram(&secret);
|
||||
Some(compute::ScramKeys {
|
||||
client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(),
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters};
|
||||
use thiserror::Error;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, info_span};
|
||||
use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
@@ -50,17 +51,20 @@ pub async fn handle_user(
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<compute::NodeInfo> {
|
||||
let psql_session_id = new_psql_session_id();
|
||||
let span = info_span!("link", psql_session_id = &psql_session_id);
|
||||
let greeting = hello_message(link_uri, &psql_session_id);
|
||||
|
||||
let db_info = super::with_waiter(psql_session_id, |waiter| async {
|
||||
// Give user a URL to spawn a new database
|
||||
// Give user a URL to spawn a new database.
|
||||
info!(parent: &span, "sending the auth URL to the user");
|
||||
client
|
||||
.write_message_noflush(&Be::AuthenticationOk)?
|
||||
.write_message_noflush(&BeParameterStatusMessage::encoding())?
|
||||
.write_message(&Be::NoticeResponse(&greeting))
|
||||
.await?;
|
||||
|
||||
// Wait for web console response (see `mgmt`)
|
||||
// Wait for web console response (see `mgmt`).
|
||||
info!(parent: &span, "waiting for console's reply...");
|
||||
waiter.await?.map_err(LinkAuthError::AuthFailed)
|
||||
})
|
||||
.await?;
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
use crate::error::UserFacingError;
|
||||
use std::borrow::Cow;
|
||||
use thiserror::Error;
|
||||
use tracing::info;
|
||||
use utils::pq_proto::StartupMessageParams;
|
||||
|
||||
#[derive(Debug, Error, PartialEq, Eq, Clone)]
|
||||
@@ -82,6 +83,13 @@ impl<'a> ClientCredentials<'a> {
|
||||
}
|
||||
.transpose()?;
|
||||
|
||||
info!(
|
||||
user = user,
|
||||
dbname = dbname,
|
||||
project = project.as_deref(),
|
||||
"credentials"
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
user,
|
||||
dbname,
|
||||
|
||||
@@ -4,6 +4,7 @@ use parking_lot::Mutex;
|
||||
use std::net::SocketAddr;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio_postgres::{CancelToken, NoTls};
|
||||
use tracing::info;
|
||||
use utils::pq_proto::CancelKeyData;
|
||||
|
||||
/// Enables serving `CancelRequest`s.
|
||||
@@ -18,8 +19,9 @@ impl CancelMap {
|
||||
.lock()
|
||||
.get(&key)
|
||||
.and_then(|x| x.clone())
|
||||
.with_context(|| format!("unknown session: {:?}", key))?;
|
||||
.with_context(|| format!("query cancellation key not found: {key}"))?;
|
||||
|
||||
info!("cancelling query per user's request using key {key}");
|
||||
cancel_closure.try_cancel_query().await
|
||||
}
|
||||
|
||||
@@ -41,14 +43,16 @@ impl CancelMap {
|
||||
self.0
|
||||
.lock()
|
||||
.try_insert(key, None)
|
||||
.map_err(|_| anyhow!("session already exists: {:?}", key))?;
|
||||
.map_err(|_| anyhow!("query cancellation key already exists: {key}"))?;
|
||||
|
||||
// This will guarantee that the session gets dropped
|
||||
// as soon as the future is finished.
|
||||
scopeguard::defer! {
|
||||
self.0.lock().remove(&key);
|
||||
info!("dropped query cancellation key {key}");
|
||||
}
|
||||
|
||||
info!("registered new query cancellation key {key}");
|
||||
let session = Session::new(key, self);
|
||||
f(session).await
|
||||
}
|
||||
@@ -102,10 +106,13 @@ impl<'a> Session<'a> {
|
||||
fn new(key: CancelKeyData, cancel_map: &'a CancelMap) -> Self {
|
||||
Self { key, cancel_map }
|
||||
}
|
||||
}
|
||||
|
||||
impl Session<'_> {
|
||||
/// Store the cancel token for the given session.
|
||||
/// This enables query cancellation in [`crate::proxy::handshake`].
|
||||
pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
|
||||
info!("enabling query cancellation for this session");
|
||||
self.cancel_map
|
||||
.0
|
||||
.lock()
|
||||
|
||||
@@ -5,6 +5,7 @@ use std::{io, net::SocketAddr};
|
||||
use thiserror::Error;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio_postgres::NoTls;
|
||||
use tracing::{error, info};
|
||||
use utils::pq_proto::StartupMessageParams;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
@@ -54,6 +55,7 @@ impl NodeInfo {
|
||||
use tokio_postgres::config::Host;
|
||||
|
||||
let connect_once = |host, port| {
|
||||
info!("trying to connect to a compute node at {host}:{port}");
|
||||
TcpStream::connect((host, port)).and_then(|socket| async {
|
||||
let socket_addr = socket.peer_addr()?;
|
||||
// This prevents load balancer from severing the connection.
|
||||
@@ -72,7 +74,11 @@ impl NodeInfo {
|
||||
if ports.len() > 1 && ports.len() != hosts.len() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("couldn't connect: bad compute config, ports and hosts entries' count does not match: {:?}", self.config),
|
||||
format!(
|
||||
"couldn't connect: bad compute config, \
|
||||
ports and hosts entries' count does not match: {:?}",
|
||||
self.config
|
||||
),
|
||||
));
|
||||
}
|
||||
|
||||
@@ -88,7 +94,7 @@ impl NodeInfo {
|
||||
Ok(socket) => return Ok(socket),
|
||||
Err(err) => {
|
||||
// We can't throw an error here, as there might be more hosts to try.
|
||||
println!("failed to connect to compute `{host}:{port}`: {err}");
|
||||
error!("failed to connect to a compute node at {host}:{port}: {err}");
|
||||
connection_error = Some(err);
|
||||
}
|
||||
}
|
||||
@@ -160,8 +166,8 @@ impl NodeInfo {
|
||||
.ok_or(ConnectionError::FailedToFetchPgVersion)?
|
||||
.into();
|
||||
|
||||
info!("connected to user's compute node at {socket_addr}");
|
||||
let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token());
|
||||
|
||||
let db = PostgresConnection { stream, version };
|
||||
|
||||
Ok((db, cancel_closure))
|
||||
|
||||
@@ -17,6 +17,7 @@ impl Endpoint {
|
||||
Self { endpoint, client }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn url(&self) -> &ApiUrl {
|
||||
&self.endpoint
|
||||
}
|
||||
@@ -36,6 +37,16 @@ impl Endpoint {
|
||||
) -> Result<reqwest::Response, reqwest::Error> {
|
||||
self.client.execute(request).await
|
||||
}
|
||||
|
||||
/// Execute a [request](reqwest::Request) and return an error if the response status is a client or server error (4xx/5xx).
|
||||
pub async fn checked_execute(
|
||||
&self,
|
||||
request: reqwest::Request,
|
||||
) -> Result<reqwest::Response, reqwest::Error> {
|
||||
self.execute(request)
|
||||
.await
|
||||
.and_then(|r| r.error_for_status())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use anyhow::anyhow;
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use std::net::TcpListener;
|
||||
use tracing::info;
|
||||
use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService};
|
||||
|
||||
async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -12,9 +13,9 @@ fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
|
||||
router.get("/v1/status", status_handler)
|
||||
}
|
||||
|
||||
pub async fn thread_main(http_listener: TcpListener) -> anyhow::Result<()> {
|
||||
pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
println!("http has shut down");
|
||||
info!("http has shut down");
|
||||
}
|
||||
|
||||
let service = || RouterService::new(make_router().build()?);
|
||||
|
||||
@@ -23,8 +23,10 @@ use anyhow::{bail, Context};
|
||||
use clap::{self, Arg};
|
||||
use config::ProxyConfig;
|
||||
use futures::FutureExt;
|
||||
use metrics::set_build_info_metric;
|
||||
use std::{borrow::Cow, future::Future, net::SocketAddr};
|
||||
use tokio::{net::TcpListener, task::JoinError};
|
||||
use tracing::info;
|
||||
use utils::project_git_version;
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
@@ -38,98 +40,48 @@ async fn flatten_err(
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = clap::App::new("Neon proxy/router")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::new("proxy")
|
||||
.short('p')
|
||||
.long("proxy")
|
||||
.takes_value(true)
|
||||
.help("listen for incoming client connections on ip:port")
|
||||
.default_value("127.0.0.1:4432"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("auth-backend")
|
||||
.long("auth-backend")
|
||||
.takes_value(true)
|
||||
.possible_values(["console", "postgres", "link"])
|
||||
.default_value("link"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("mgmt")
|
||||
.short('m')
|
||||
.long("mgmt")
|
||||
.takes_value(true)
|
||||
.help("listen for management callback connection on ip:port")
|
||||
.default_value("127.0.0.1:7000"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("http")
|
||||
.short('h')
|
||||
.long("http")
|
||||
.takes_value(true)
|
||||
.help("listen for incoming http connections (metrics, etc) on ip:port")
|
||||
.default_value("127.0.0.1:7001"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("uri")
|
||||
.short('u')
|
||||
.long("uri")
|
||||
.takes_value(true)
|
||||
.help("redirect unauthenticated users to the given uri in case of link auth")
|
||||
.default_value("http://localhost:3000/psql_session/"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("auth-endpoint")
|
||||
.short('a')
|
||||
.long("auth-endpoint")
|
||||
.takes_value(true)
|
||||
.help("cloud API endpoint for authenticating users")
|
||||
.default_value("http://localhost:3000/authenticate_proxy_request/"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("tls-key")
|
||||
.short('k')
|
||||
.long("tls-key")
|
||||
.alias("ssl-key") // backwards compatibility
|
||||
.takes_value(true)
|
||||
.help("path to TLS key for client postgres connections"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("tls-cert")
|
||||
.short('c')
|
||||
.long("tls-cert")
|
||||
.alias("ssl-cert") // backwards compatibility
|
||||
.takes_value(true)
|
||||
.help("path to TLS cert for client postgres connections"),
|
||||
)
|
||||
.get_matches();
|
||||
tracing_subscriber::fmt()
|
||||
.with_ansi(atty::is(atty::Stream::Stdout))
|
||||
.with_target(false)
|
||||
.init();
|
||||
|
||||
let arg_matches = cli().get_matches();
|
||||
|
||||
let tls_config = match (
|
||||
arg_matches.value_of("tls-key"),
|
||||
arg_matches.value_of("tls-cert"),
|
||||
arg_matches.get_one::<String>("tls-key"),
|
||||
arg_matches.get_one::<String>("tls-cert"),
|
||||
) {
|
||||
(Some(key_path), Some(cert_path)) => Some(config::configure_tls(key_path, cert_path)?),
|
||||
(None, None) => None,
|
||||
_ => bail!("either both or neither tls-key and tls-cert must be specified"),
|
||||
};
|
||||
|
||||
let proxy_address: SocketAddr = arg_matches.value_of("proxy").unwrap().parse()?;
|
||||
let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?;
|
||||
let http_address: SocketAddr = arg_matches.value_of("http").unwrap().parse()?;
|
||||
let proxy_address: SocketAddr = arg_matches.get_one::<String>("proxy").unwrap().parse()?;
|
||||
let mgmt_address: SocketAddr = arg_matches.get_one::<String>("mgmt").unwrap().parse()?;
|
||||
let http_address: SocketAddr = arg_matches.get_one::<String>("http").unwrap().parse()?;
|
||||
|
||||
let auth_backend = match arg_matches.value_of("auth-backend").unwrap() {
|
||||
let auth_backend = match arg_matches
|
||||
.get_one::<String>("auth-backend")
|
||||
.unwrap()
|
||||
.as_str()
|
||||
{
|
||||
"console" => {
|
||||
let url = arg_matches.value_of("auth-endpoint").unwrap().parse()?;
|
||||
let url = arg_matches
|
||||
.get_one::<String>("auth-endpoint")
|
||||
.unwrap()
|
||||
.parse()?;
|
||||
let endpoint = http::Endpoint::new(url, reqwest::Client::new());
|
||||
auth::BackendType::Console(Cow::Owned(endpoint), ())
|
||||
}
|
||||
"postgres" => {
|
||||
let url = arg_matches.value_of("auth-endpoint").unwrap().parse()?;
|
||||
let url = arg_matches
|
||||
.get_one::<String>("auth-endpoint")
|
||||
.unwrap()
|
||||
.parse()?;
|
||||
auth::BackendType::Postgres(Cow::Owned(url), ())
|
||||
}
|
||||
"link" => {
|
||||
let url = arg_matches.value_of("uri").unwrap().parse()?;
|
||||
let url = arg_matches.get_one::<String>("uri").unwrap().parse()?;
|
||||
auth::BackendType::Link(Cow::Owned(url))
|
||||
}
|
||||
other => bail!("unsupported auth backend: {other}"),
|
||||
@@ -140,29 +92,95 @@ async fn main() -> anyhow::Result<()> {
|
||||
auth_backend,
|
||||
}));
|
||||
|
||||
println!("Version: {GIT_VERSION}");
|
||||
println!("Authentication backend: {}", config.auth_backend);
|
||||
info!("Version: {GIT_VERSION}");
|
||||
info!("Authentication backend: {}", config.auth_backend);
|
||||
|
||||
// Check that we can bind to address before further initialization
|
||||
println!("Starting http on {}", http_address);
|
||||
info!("Starting http on {http_address}");
|
||||
let http_listener = TcpListener::bind(http_address).await?.into_std()?;
|
||||
|
||||
println!("Starting mgmt on {}", mgmt_address);
|
||||
info!("Starting mgmt on {mgmt_address}");
|
||||
let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?;
|
||||
|
||||
println!("Starting proxy on {}", proxy_address);
|
||||
info!("Starting proxy on {proxy_address}");
|
||||
let proxy_listener = TcpListener::bind(proxy_address).await?;
|
||||
|
||||
let tasks = [
|
||||
tokio::spawn(http::server::thread_main(http_listener)),
|
||||
tokio::spawn(proxy::thread_main(config, proxy_listener)),
|
||||
tokio::spawn(http::server::task_main(http_listener)),
|
||||
tokio::spawn(proxy::task_main(config, proxy_listener)),
|
||||
tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)),
|
||||
]
|
||||
.map(flatten_err);
|
||||
|
||||
set_build_info_metric(GIT_VERSION);
|
||||
// This will block until all tasks have completed.
|
||||
// Furthermore, the first one to fail will cancel the rest.
|
||||
let _: Vec<()> = futures::future::try_join_all(tasks).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cli() -> clap::Command {
|
||||
clap::Command::new("Neon proxy/router")
|
||||
.disable_help_flag(true)
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::new("proxy")
|
||||
.short('p')
|
||||
.long("proxy")
|
||||
.help("listen for incoming client connections on ip:port")
|
||||
.default_value("127.0.0.1:4432"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("auth-backend")
|
||||
.long("auth-backend")
|
||||
.value_parser(["console", "postgres", "link"])
|
||||
.default_value("link"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("mgmt")
|
||||
.short('m')
|
||||
.long("mgmt")
|
||||
.help("listen for management callback connection on ip:port")
|
||||
.default_value("127.0.0.1:7000"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("http")
|
||||
.long("http")
|
||||
.help("listen for incoming http connections (metrics, etc) on ip:port")
|
||||
.default_value("127.0.0.1:7001"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("uri")
|
||||
.short('u')
|
||||
.long("uri")
|
||||
.help("redirect unauthenticated users to the given uri in case of link auth")
|
||||
.default_value("http://localhost:3000/psql_session/"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("auth-endpoint")
|
||||
.short('a')
|
||||
.long("auth-endpoint")
|
||||
.help("cloud API endpoint for authenticating users")
|
||||
.default_value("http://localhost:3000/authenticate_proxy_request/"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("tls-key")
|
||||
.short('k')
|
||||
.long("tls-key")
|
||||
.alias("ssl-key") // backwards compatibility
|
||||
.help("path to TLS key for client postgres connections"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("tls-cert")
|
||||
.short('c')
|
||||
.long("tls-cert")
|
||||
.alias("ssl-cert") // backwards compatibility
|
||||
.help("path to TLS cert for client postgres connections"),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn verify_cli() {
|
||||
cli().debug_assert();
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ use std::{
|
||||
net::{TcpListener, TcpStream},
|
||||
thread,
|
||||
};
|
||||
use tracing::{error, info};
|
||||
use utils::{
|
||||
postgres_backend::{self, AuthType, PostgresBackend},
|
||||
pq_proto::{BeMessage, SINGLE_COL_ROWDESC},
|
||||
@@ -19,7 +20,7 @@ use utils::{
|
||||
///
|
||||
pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
println!("mgmt has shut down");
|
||||
info!("mgmt has shut down");
|
||||
}
|
||||
|
||||
listener
|
||||
@@ -27,14 +28,14 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
|
||||
.context("failed to set listener to blocking")?;
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?;
|
||||
println!("accepted connection from {}", peer_addr);
|
||||
info!("accepted connection from {peer_addr}");
|
||||
socket
|
||||
.set_nodelay(true)
|
||||
.context("failed to set client socket option")?;
|
||||
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = handle_connection(socket) {
|
||||
println!("error: {}", err);
|
||||
error!("{err}");
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -102,14 +103,14 @@ impl postgres_backend::Handler for MgmtHandler {
|
||||
let res = try_process_query(pgb, query_string);
|
||||
// intercept and log error message
|
||||
if res.is_err() {
|
||||
println!("Mgmt query failed: #{:?}", res);
|
||||
error!("mgmt query failed: {res:?}");
|
||||
}
|
||||
res
|
||||
}
|
||||
}
|
||||
|
||||
fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::Result<()> {
|
||||
println!("Got mgmt query [redacted]"); // Content contains password, don't print it
|
||||
info!("got mgmt query [redacted]"); // Content contains password, don't print it
|
||||
|
||||
let resp: PsqlSessionResponse = serde_json::from_str(query_string)?;
|
||||
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
use crate::auth;
|
||||
use crate::cancellation::{self, CancelMap};
|
||||
use crate::config::{ProxyConfig, TlsConfig};
|
||||
use crate::stream::{MetricsStream, PqStream, Stream};
|
||||
use crate::stream::{MeasuredStream, PqStream, Stream};
|
||||
use anyhow::{bail, Context};
|
||||
use futures::TryFutureExt;
|
||||
use metrics::{register_int_counter, IntCounter};
|
||||
use once_cell::sync::Lazy;
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{error, info, info_span, Instrument};
|
||||
use utils::pq_proto::{BeMessage as Be, *};
|
||||
|
||||
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
|
||||
@@ -43,17 +44,17 @@ where
|
||||
F: std::future::Future<Output = anyhow::Result<R>>,
|
||||
{
|
||||
future.await.map_err(|err| {
|
||||
println!("error: {}", err);
|
||||
error!("{err}");
|
||||
err
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn thread_main(
|
||||
pub async fn task_main(
|
||||
config: &'static ProxyConfig,
|
||||
listener: tokio::net::TcpListener,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
println!("proxy has shut down");
|
||||
info!("proxy has shut down");
|
||||
}
|
||||
|
||||
// When set for the server socket, the keepalive setting
|
||||
@@ -63,22 +64,29 @@ pub async fn thread_main(
|
||||
let cancel_map = Arc::new(CancelMap::default());
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept().await?;
|
||||
println!("accepted connection from {}", peer_addr);
|
||||
info!("accepted postgres client connection from {peer_addr}");
|
||||
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
let cancel_map = Arc::clone(&cancel_map);
|
||||
tokio::spawn(log_error(async move {
|
||||
socket
|
||||
.set_nodelay(true)
|
||||
.context("failed to set socket option")?;
|
||||
tokio::spawn(
|
||||
log_error(async move {
|
||||
info!("spawned a task for {peer_addr}");
|
||||
|
||||
handle_client(config, &cancel_map, socket).await
|
||||
}));
|
||||
socket
|
||||
.set_nodelay(true)
|
||||
.context("failed to set socket option")?;
|
||||
|
||||
handle_client(config, &cancel_map, session_id, socket).await
|
||||
})
|
||||
.instrument(info_span!("client", session = format_args!("{session_id}"))),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_client(
|
||||
config: &ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
session_id: uuid::Uuid,
|
||||
stream: impl AsyncRead + AsyncWrite + Unpin + Send,
|
||||
) -> anyhow::Result<()> {
|
||||
// The `closed` counter will increase when this future is destroyed.
|
||||
@@ -88,7 +96,8 @@ async fn handle_client(
|
||||
}
|
||||
|
||||
let tls = config.tls_config.as_ref();
|
||||
let (mut stream, params) = match handshake(stream, tls, cancel_map).await? {
|
||||
let do_handshake = handshake(stream, tls, cancel_map).instrument(info_span!("handshake"));
|
||||
let (mut stream, params) = match do_handshake.await? {
|
||||
Some(x) => x,
|
||||
None => return Ok(()), // it's a cancellation request
|
||||
};
|
||||
@@ -106,7 +115,7 @@ async fn handle_client(
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
};
|
||||
|
||||
let client = Client::new(stream, creds, ¶ms);
|
||||
let client = Client::new(stream, creds, ¶ms, session_id);
|
||||
cancel_map
|
||||
.with_session(|session| client.connect_to_db(session))
|
||||
.await
|
||||
@@ -127,7 +136,7 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
let mut stream = PqStream::new(Stream::from_raw(stream));
|
||||
loop {
|
||||
let msg = stream.read_startup_packet().await?;
|
||||
println!("got message: {:?}", msg);
|
||||
info!("received {msg:?}");
|
||||
|
||||
use FeStartupPacket::*;
|
||||
match msg {
|
||||
@@ -164,11 +173,13 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
|
||||
}
|
||||
|
||||
info!(session_type = "normal", "successful handshake");
|
||||
break Ok(Some((stream, params)));
|
||||
}
|
||||
CancelRequest(cancel_key_data) => {
|
||||
cancel_map.cancel_session(cancel_key_data).await?;
|
||||
|
||||
info!(session_type = "cancellation", "successful handshake");
|
||||
break Ok(None);
|
||||
}
|
||||
}
|
||||
@@ -183,6 +194,8 @@ struct Client<'a, S> {
|
||||
creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
|
||||
/// KV-dictionary with PostgreSQL connection params.
|
||||
params: &'a StartupMessageParams,
|
||||
/// Unique connection ID.
|
||||
session_id: uuid::Uuid,
|
||||
}
|
||||
|
||||
impl<'a, S> Client<'a, S> {
|
||||
@@ -191,11 +204,13 @@ impl<'a, S> Client<'a, S> {
|
||||
stream: PqStream<S>,
|
||||
creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
|
||||
params: &'a StartupMessageParams,
|
||||
session_id: uuid::Uuid,
|
||||
) -> Self {
|
||||
Self {
|
||||
stream,
|
||||
creds,
|
||||
params,
|
||||
session_id,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -207,17 +222,20 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
|
||||
mut stream,
|
||||
creds,
|
||||
params,
|
||||
session_id,
|
||||
} = self;
|
||||
|
||||
let extra = auth::ConsoleReqExtra {
|
||||
// Currently it's OK to generate a new UUID **here**, but
|
||||
// it might be better to move this to `cancellation::Session`.
|
||||
session_id: uuid::Uuid::new_v4(),
|
||||
session_id, // aka this connection's id
|
||||
application_name: params.get("application_name"),
|
||||
};
|
||||
|
||||
// Authenticate and connect to a compute node.
|
||||
let auth = creds.authenticate(&extra, &mut stream).await;
|
||||
let auth = creds
|
||||
.authenticate(&extra, &mut stream)
|
||||
.instrument(info_span!("auth"))
|
||||
.await;
|
||||
|
||||
let node = async { auth }.or_else(|e| stream.throw_error(e)).await?;
|
||||
let reported_auth_ok = node.reported_auth_ok;
|
||||
|
||||
@@ -251,8 +269,9 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
|
||||
}
|
||||
|
||||
// Starting from here we only proxy the client's traffic.
|
||||
let mut db = MetricsStream::new(db.stream, inc_proxied);
|
||||
let mut client = MetricsStream::new(stream.into_inner(), inc_proxied);
|
||||
info!("performing the proxy pass...");
|
||||
let mut db = MeasuredStream::new(db.stream, inc_proxied);
|
||||
let mut client = MeasuredStream::new(stream.into_inner(), inc_proxied);
|
||||
let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?;
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -231,7 +231,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for Stream<S> {
|
||||
pin_project! {
|
||||
/// This stream tracks all writes and calls user provided
|
||||
/// callback when the underlying stream is flushed.
|
||||
pub struct MetricsStream<S, W> {
|
||||
pub struct MeasuredStream<S, W> {
|
||||
#[pin]
|
||||
stream: S,
|
||||
write_count: usize,
|
||||
@@ -239,7 +239,7 @@ pin_project! {
|
||||
}
|
||||
}
|
||||
|
||||
impl<S, W> MetricsStream<S, W> {
|
||||
impl<S, W> MeasuredStream<S, W> {
|
||||
pub fn new(stream: S, inc_write_count: W) -> Self {
|
||||
Self {
|
||||
stream,
|
||||
@@ -249,7 +249,7 @@ impl<S, W> MetricsStream<S, W> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead + Unpin, W> AsyncRead for MetricsStream<S, W> {
|
||||
impl<S: AsyncRead + Unpin, W> AsyncRead for MeasuredStream<S, W> {
|
||||
fn poll_read(
|
||||
self: Pin<&mut Self>,
|
||||
context: &mut task::Context<'_>,
|
||||
@@ -259,7 +259,7 @@ impl<S: AsyncRead + Unpin, W> AsyncRead for MetricsStream<S, W> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncWrite + Unpin, W: FnMut(usize)> AsyncWrite for MetricsStream<S, W> {
|
||||
impl<S: AsyncWrite + Unpin, W: FnMut(usize)> AsyncWrite for MeasuredStream<S, W> {
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
context: &mut task::Context<'_>,
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
# avoid running regular linting script that checks every feature.
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
# no extra features to test currently, add more here when needed
|
||||
cargo clippy --locked --all --all-targets -- -A unknown_lints -D warnings
|
||||
cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings
|
||||
else
|
||||
# * `-A unknown_lints` – do not warn about unknown lint suppressions
|
||||
# that people with newer toolchains might use
|
||||
|
||||
@@ -11,7 +11,7 @@ hyper = "0.14"
|
||||
fs2 = "0.4.3"
|
||||
serde_json = "1"
|
||||
tracing = "0.1.27"
|
||||
clap = "3.0"
|
||||
clap = "4.0"
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.17", features = ["macros", "fs"] }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
@@ -22,14 +22,14 @@ humantime = "2.1.0"
|
||||
url = "2.2.2"
|
||||
signal-hook = "0.3.10"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "1.12.0"
|
||||
serde_with = "2.0"
|
||||
hex = "0.4.3"
|
||||
const_format = "0.2.21"
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
git-version = "0.3.5"
|
||||
async-trait = "0.1"
|
||||
once_cell = "1.13.0"
|
||||
toml_edit = { version = "0.13", features = ["easy"] }
|
||||
toml_edit = { version = "0.14", features = ["easy"] }
|
||||
thiserror = "1"
|
||||
parking_lot = "0.12.1"
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user