Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-17 18:32:56 +00:00)

Compare commits: file_page_...persistent
121 commits in this comparison:

8261455019, aad88d6c39, 6188315b51, 3a4b932d8a, cc2b3c986c, c250c2664b, e5550a01b0, 45617ceaef, 29b39301fe, b01a93be60,
4c68d019e3, 85f0975c5a, 1af087449a, 37625c4433, e9f4ca5972, 4bf3087aed, 9470bc9fe0, 86e483f87b, f50d0ec0c9, 74ec36a1bf,
a63ebb6446, a5b898a31c, c6f095a821, 6b2bc7f775, 6c97fc941a, cb9b26776e, 684329d4d2, ed40a045c0, 3f39327622, a50a7e8ac0,
e28eda7939, f564dff0e3, d783889a1f, 2655bdbb2e, b9152f1ef4, 328ec1ce24, dcb79ef08f, fd99e0fbc4, 60ac227196, 4a60051b0d,
24d3ed0952, 0a87d71294, 150bddb929, 2b728bc69e, 5184685ced, 9ae4da4f31, aca221ac8b, d013a2b227, 3f93c6c6f0, 53267969d7,
c4b417ecdb, 1d105727cb, 4787a744c2, ac3ccac56c, 638af96c51, 1e21ca1afe, 46d30bf054, d0105cea1f, e44e4a699b, 223834a420,
01778e37cc, 03190a2161, f87017c04d, c11cbf0f5c, f30ef00439, dbe5b52494, 4131a6efae, 03695261fc, 7fd88fab59, 7edc098c40,
8421218152, d5b7832c21, c6072d38c2, 175779c0ef, 8654e95fae, f720dd735e, c4f9f1dc6d, 4a10e1b066, b55466045e, e999f66b01,
1cf257bc4a, 40164bd589, c3a470a29b, c1a76eb0e5, d5b6471fa9, 548d472b12, 99e745a760, 15d970f731, 7b7f84f1b4, bc40a5595f,
07b3ba5ce3, c38f38dab7, 71d268c7c4, cf68963b18, 63221e4b42, d7eeb73f6f, 5112142997, a0a74868a4, b154992510, a86a38c96e,
590f894db8, 0a0595b98d, e56d11c8e1, ccdc3188ed, 67401cbdb8, d42700280f, 6df4d5c911, 32d14403bd, 0df3467146, c64a121aa8,
22cc8760b9, 596d622a82, 7481fb082c, 1eb9bd052a, 59a3ca4ec6, e86a9105a4, d3c8749da5, 128dc8d405, 0cbae6e8f3, 78e412b84b,
6dbf202e0d
.github/actions/run-python-test-set/action.yml (vendored, 41 lines changed)

@@ -55,6 +55,22 @@ runs:
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
path: /tmp/neon

- name: Download Neon binaries for the previous release
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
path: /tmp/neon-previous
prefix: latest

- name: Download compatibility snapshot for Postgres 14
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14
path: /tmp/compatibility_snapshot_pg14
prefix: latest

- name: Checkout
if: inputs.needs_postgres_source == 'true'
uses: actions/checkout@v3

@@ -73,22 +89,18 @@ runs:
shell: bash -euxo pipefail {0}
run: ./scripts/pysync

- name: Download compatibility snapshot for Postgres 14
uses: ./.github/actions/download
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14
path: /tmp/compatibility_snapshot_pg14
prefix: latest

- name: Run pytest
env:
NEON_BIN: /tmp/neon/bin
COMPATIBILITY_NEON_BIN: /tmp/neon-previous/bin
COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: ${{ inputs.build_type }}
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes')
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report

@@ -111,7 +123,12 @@ runs:
exit 1
fi
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
# -n4 uses four processes to run tests via pytest-xdist
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"

# --dist=loadgroup points tests marked with @pytest.mark.xdist_group
# to the same worker to make @pytest.mark.order work with xdist
EXTRA_PARAMS="--dist=loadgroup $EXTRA_PARAMS"
fi

if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then

@@ -146,9 +163,9 @@ runs:
# --verbose prints name of each test (helpful when there are
# multiple tests in one file)
# -rA prints summary in the end
# -n4 uses four processes to run tests via pytest-xdist
# -s is not used to prevent pytest from capturing output, because tests are running
# in parallel and logs are mixed between different tests
#
mkdir -p $TEST_OUTPUT/allure/results
"${cov_prefix[@]}" ./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \

@@ -168,12 +185,12 @@ runs:
uses: ./.github/actions/upload
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
# The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/
# The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/
prefix: latest

- name: Create Allure report
if: always()
if: success() || failure()
uses: ./.github/actions/allure-report
with:
action: store
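As an illustration of what the hunk above assembles when run_in_parallel is enabled, here is a hedged sketch of the resulting pytest invocation; TEST_OUTPUT comes from the env block above, and any flags beyond the ones named in the comments are omitted.

```bash
# Sketch only: the parallel pytest command the action builds up above.
EXTRA_PARAMS=""
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"              # pytest-xdist: four worker processes
EXTRA_PARAMS="--dist=loadgroup $EXTRA_PARAMS" # keep @pytest.mark.xdist_group tests on one worker

mkdir -p "$TEST_OUTPUT/allure/results"
./scripts/pytest \
  --junitxml="$TEST_OUTPUT/junit.xml" \
  --verbose -rA \
  $EXTRA_PARAMS
```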
.github/ansible/.gitignore (vendored, 2 lines changed)

@@ -1,5 +1,3 @@
zenith_install.tar.gz
.zenith_current_version
neon_install.tar.gz
.neon_current_version
.github/ansible/prod.ap-southeast-1.hosts.yaml (vendored, new file, 35 lines)

@@ -0,0 +1,35 @@
storage:
vars:
bucket_name: neon-prod-storage-ap-southeast-1
bucket_region: ap-southeast-1
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: ap-southeast-1
ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1
console_region_id: aws-ap-southeast-1

children:
pageservers:
hosts:
pageserver-0.ap-southeast-1.aws.neon.tech:
ansible_host: i-064de8ea28bdb495b
pageserver-1.ap-southeast-1.aws.neon.tech:
ansible_host: i-0b180defcaeeb6b93

safekeepers:
hosts:
safekeeper-0.ap-southeast-1.aws.neon.tech:
ansible_host: i-0d6f1dc5161eef894
safekeeper-1.ap-southeast-1.aws.neon.tech:
ansible_host: i-0e338adda8eb2d19f
safekeeper-2.ap-southeast-1.aws.neon.tech:
ansible_host: i-04fb63634e4679eb9

.github/ansible/prod.eu-central-1.hosts.yaml (vendored, new file, 35 lines)

@@ -0,0 +1,35 @@
storage:
vars:
bucket_name: neon-prod-storage-eu-central-1
bucket_region: eu-central-1
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: eu-central-1
ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1
console_region_id: aws-eu-central-1

children:
pageservers:
hosts:
pageserver-0.eu-central-1.aws.neon.tech:
ansible_host: i-0cd8d316ecbb715be
pageserver-1.eu-central-1.aws.neon.tech:
ansible_host: i-090044ed3d383fef0

safekeepers:
hosts:
safekeeper-0.eu-central-1.aws.neon.tech:
ansible_host: i-0b238612d2318a050
safekeeper-1.eu-central-1.aws.neon.tech:
ansible_host: i-07b9c45e5c2637cd4
safekeeper-2.eu-central-1.aws.neon.tech:
ansible_host: i-020257302c3c93d88

.github/ansible/prod.us-east-2.hosts.yaml (vendored, new file, 36 lines)

@@ -0,0 +1,36 @@
storage:
vars:
bucket_name: neon-prod-storage-us-east-2
bucket_region: us-east-2
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: us-east-2
ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2
console_region_id: aws-us-east-2

children:
pageservers:
hosts:
pageserver-0.us-east-2.aws.neon.tech:
ansible_host: i-062227ba7f119eb8c
pageserver-1.us-east-2.aws.neon.tech:
ansible_host: i-0b3ec0afab5968938

safekeepers:
hosts:
safekeeper-0.us-east-2.aws.neon.tech:
ansible_host: i-0e94224750c57d346
safekeeper-1.us-east-2.aws.neon.tech:
ansible_host: i-06d113fb73bfddeb0
safekeeper-2.us-east-2.aws.neon.tech:
ansible_host: i-09f66c8e04afff2e8
.github/ansible/production.hosts.yaml (vendored, 4 lines changed)

@@ -22,6 +22,10 @@ storage:
console_region_id: aws-us-west-2
zenith-1-ps-3:
console_region_id: aws-us-west-2
zenith-1-ps-4:
console_region_id: aws-us-west-2
zenith-1-ps-5:
console_region_id: aws-us-west-2

safekeepers:
hosts:

.github/ansible/ssm_config (vendored, 1 line changed)

@@ -1,3 +1,2 @@
ansible_connection: aws_ssm
ansible_aws_ssm_bucket_name: neon-dev-bucket
ansible_python_interpreter: /usr/bin/python3
.github/ansible/staging.eu-west-1.hosts.yaml (vendored, new file, 33 lines)

@@ -0,0 +1,33 @@
storage:
vars:
bucket_name: neon-dev-storage-eu-west-1
bucket_region: eu-west-1
console_mgmt_base_url: http://console-staging.local
etcd_endpoints: etcd-0.eu-west-1.aws.neon.build:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: eu-west-1
ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1
console_region_id: aws-eu-west-1

children:
pageservers:
hosts:
pageserver-0.eu-west-1.aws.neon.build:
ansible_host: i-01d496c5041c7f34c

safekeepers:
hosts:
safekeeper-0.eu-west-1.aws.neon.build:
ansible_host: i-05226ef85722831bf
safekeeper-1.eu-west-1.aws.neon.build:
ansible_host: i-06969ee1bf2958bfc
safekeeper-2.eu-west-1.aws.neon.build:
ansible_host: i-087892e9625984a0b

.github/ansible/staging.hosts.yaml (vendored, 2 lines changed)

@@ -3,7 +3,7 @@ storage:
bucket_name: zenith-staging-storage-us-east-1
bucket_region: us-east-1
console_mgmt_base_url: http://console-staging.local
etcd_endpoints: zenith-us-stage-etcd.local:2379
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:

.github/ansible/staging.us-east-2.hosts.yaml (vendored, 3 lines changed)

@@ -14,6 +14,7 @@ storage:
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: us-east-2
ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
console_region_id: aws-us-east-2

children:

@@ -21,6 +22,8 @@ storage:
hosts:
pageserver-0.us-east-2.aws.neon.build:
ansible_host: i-0c3e70929edb5d691
pageserver-1.us-east-2.aws.neon.build:
ansible_host: i-0565a8b4008aa3f40

safekeepers:
hosts:
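For context, a sketch of how these inventory files and ssm_config are consumed, mirroring the deploy jobs further down this page; the API token value is a placeholder.

```bash
# Sketch: deploy storage to one region using the inventories above.
cd .github/ansible
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml \
  -i prod.us-east-2.hosts.yaml \
  -e @ssm_config \
  -e CONSOLE_API_TOKEN="$NEON_PRODUCTION_API_KEY"   # placeholder secret
```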
.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml (vendored, new file, 31 lines)

@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

image:
repository: neondatabase/neon

settings:
authBackend: "console"
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.eu-west-1.aws.neon.build"

# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: dev
zenith_region: eu-west-1
zenith_region_slug: eu-west-1

exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build

#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml (vendored, new file, 31 lines)

@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

image:
repository: neondatabase/neon

settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.ap-southeast-1.aws.neon.tech"

# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: ap-southeast-1
zenith_region_slug: ap-southeast-1

exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech

#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml (vendored, new file, 31 lines)

@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

image:
repository: neondatabase/neon

settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.eu-central-1.aws.neon.tech"

# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: eu-central-1
zenith_region_slug: eu-central-1

exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech

#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml (vendored, new file, 31 lines)

@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

image:
repository: neondatabase/neon

settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.us-east-2.aws.neon.tech"

# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: us-east-2
zenith_region_slug: us-east-2

exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech

#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack
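A sketch of how one of these per-cluster values files is applied, mirroring the deploy-proxy jobs later in this page; DOCKER_TAG stands in for the build tag.

```bash
# Sketch: roll out neon-proxy-scram with one of the values files above.
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region us-east-2 eks update-kubeconfig --name prod-us-east-2-delta
helm upgrade neon-proxy-scram neondatabase/neon-proxy \
  --namespace neon-proxy --create-namespace --install \
  -f .github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml \
  --set image.tag="$DOCKER_TAG" \
  --wait --timeout 15m0s
```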
.github/workflows/benchmarking.yml (vendored, 13 lines changed)

@@ -144,7 +144,9 @@ jobs:
# neon-captest-new: Run pgbench in a freshly created project
# neon-captest-reuse: Same, but reusing existing project
# neon-captest-prefetch: Same, with prefetching enabled (new project)
platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch ]
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
db_size: [ 10gb ]
include:
- platform: neon-captest-new

@@ -164,7 +166,7 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
PLATFORM: ${{ matrix.platform }}

runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
options: --init

@@ -207,8 +209,11 @@ jobs:
rds-aurora)
CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }}
;;
rds-postgres)
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch' or 'rds-aurora'"
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac

@@ -265,7 +270,7 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

- name: Create Allure report
if: always()
if: success() || failure()
uses: ./.github/actions/allure-report
with:
action: generate
.github/workflows/build_and_test.yml (vendored, 313 lines changed)

@@ -18,8 +18,8 @@ env:

jobs:
tag:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}

@@ -46,7 +46,7 @@ jobs:
id: build-tag

build-neon:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init

@@ -127,8 +127,8 @@ jobs:
target/
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
key: |
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-

- name: Cache postgres v14 build
id: cache_pg_14

@@ -236,7 +236,7 @@ jobs:
uses: ./.github/actions/save-coverage-data

regress-tests:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init

@@ -268,34 +268,8 @@ jobs:
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data

upload-latest-artifacts:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests ]
if: github.ref_name == 'main'
steps:
- name: Copy Neon artifact to the latest directory
shell: bash -euxo pipefail {0}
env:
BUCKET: neon-github-public-dev
PREFIX: artifacts/${{ github.run_id }}
run: |
for build_type in debug release; do
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst

S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi

time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME}
done

benchmarks:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init

@@ -326,12 +300,12 @@ jobs:
# while coverage is currently collected for the debug ones

merge-allure-report:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests, benchmarks ]
if: always()
if: success() || failure()
strategy:
fail-fast: false
matrix:

@@ -364,7 +338,7 @@ jobs:
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json

coverage-report:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init

@@ -389,7 +363,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git/
target/
key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}

- name: Get Neon artifact
uses: ./.github/actions/download

@@ -441,15 +415,19 @@ jobs:
shell: bash -euxo pipefail {0}

trigger-e2e-tests:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
needs: [ build-neon ]
needs: [ push-docker-hub, tag ]
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
# to place a job run status update later.
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}

REMOTE_REPO="${{ github.repository_owner }}/cloud"

@@ -475,12 +453,14 @@ jobs:
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\"
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
}
}"

neon-image:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
needs: [ tag ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug

@@ -498,7 +478,7 @@ jobs:
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}

compute-tools-image:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
needs: [ tag ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug

@@ -512,28 +492,8 @@ jobs:
- name: Kaniko build compute tools
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}

compute-node-image:
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
with:
submodules: true
fetch-depth: 0

- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

# compute-node uses postgres 14, which is default now
# cloud repo depends on this image name, thus duplicating it
# remove compute-node when cloud repo is updated
- name: Kaniko build compute node with extensions v14 (compatibility)
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}}

compute-node-image-v14:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:

@@ -549,9 +509,8 @@ jobs:
- name: Kaniko build compute node with extensions v14
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}

compute-node-image-v15:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:

@@ -567,18 +526,58 @@ jobs:
- name: Kaniko build compute node with extensions v15
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}

test-images:
needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
runs-on: [ self-hosted, dev, x64 ]

steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0

# `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
# Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
# Regular pageserver version string looks like
# Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: []
# Bad versions might loop like:
# Neon page server git-env:local failpoints: true, features: ["testing"]
# Ensure that we don't have bad versions.
- name: Verify image versions
shell: bash # ensure no set -e for better error messages
run: |
pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")

echo "Pageserver version string: $pageserver_version"

if ! echo "$pageserver_version" | grep -qv 'git-env:local' ; then
echo "Pageserver version should not be the default Dockerfile one"
exit 1
fi

if ! echo "$pageserver_version" | grep -qv '"testing"' ; then
echo "Pageserver version should have no testing feature enabled"
exit 1
fi

- name: Verify docker-compose example
run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh

- name: Print logs and clean up
if: always()
run: |
docker compose -f ./docker-compose/docker-compose.yml logs || 0
docker compose -f ./docker-compose/docker-compose.yml down
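The grep -qv double negation in the step above is easy to misread; this hedged, standalone sketch runs the same checks against the two sample version strings quoted in the comments (the helper name is made up).

```bash
# Sketch: the same acceptance logic, applied to the documented samples.
good='Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: []'
bad='Neon page server git-env:local failpoints: true, features: ["testing"]'

check_version() {
  # grep -qv succeeds when the line does NOT contain the pattern,
  # so `! ... grep -qv` triggers exactly when the bad substring IS present.
  if ! echo "$1" | grep -qv 'git-env:local'; then
    echo "rejected: default Dockerfile GIT_VERSION"; return 1
  fi
  if ! echo "$1" | grep -qv '"testing"'; then
    echo "rejected: 'testing' feature enabled"; return 1
  fi
  echo "accepted"
}

check_version "$good"          # accepted
check_version "$bad" || true   # rejected
```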
promote-images:
runs-on: dev
needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
runs-on: [ self-hosted, dev, x64 ]
needs: [ tag, test-images ]
if: github.event_name != 'workflow_dispatch'
container: amazon/aws-cli
strategy:
fail-fast: false
matrix:
# compute-node uses postgres 14, which is default now
# cloud repo depends on this image name, thus duplicating it
# remove compute-node when cloud repo is updated
name: [ neon, compute-node, compute-node-v14, compute-node-v15, compute-tools ]
name: [ neon, compute-node-v14, compute-node-v15, compute-tools ]

steps:
- name: Promote image to latest

@@ -587,7 +586,7 @@ jobs:
aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"

push-docker-hub:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
needs: [ promote-images, tag ]
container: golang:1.19-bullseye

@@ -608,9 +607,6 @@ jobs:
- name: Pull compute tools image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools

- name: Pull compute node image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} compute-node

- name: Pull compute node v14 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14

@@ -625,11 +621,10 @@ jobs:
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest

- name: Configure Docker Hub login
run: |

@@ -643,9 +638,6 @@ jobs:
- name: Push compute tools image to Docker Hub
run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}

- name: Push compute node image to Docker Hub
run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}}

- name: Push compute node v14 image to Docker Hub
run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}

@@ -662,7 +654,6 @@ jobs:
run: |
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest

@@ -745,7 +736,7 @@ jobs:
rm -f neon_install.tar.gz .neon_current_version

deploy-new:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly

@@ -756,9 +747,80 @@ jobs:
defaults:
run:
shell: bash
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
strategy:
matrix:
target_region: [ us-east-2 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0

- name: Redeploy
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
./get_binaries.sh
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
RELEASE=true ./get_binaries.sh
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version

deploy-pr-test-new:
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, tag, regress-tests ]
if: |
contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
target_region: [ eu-west-1 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0

- name: Redeploy
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"

./get_binaries.sh

ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version

deploy-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
target_region: [ us-east-2, eu-central-1, ap-southeast-1 ]
steps:
- name: Checkout
uses: actions/checkout@v3

@@ -781,11 +843,11 @@ jobs:
fi

ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version

deploy-proxy:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]

@@ -827,16 +889,23 @@ jobs:
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

deploy-proxy-new:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
needs: [ push-docker-hub, tag, regress-tests ]
if: |
(github.ref_name == 'main') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: dev-us-east-2-beta
- target_region: eu-west-1
target_cluster: dev-eu-west-1-zeta
steps:
- name: Checkout
uses: actions/checkout@v3

@@ -847,15 +916,52 @@ jobs:
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region us-east-2 eks update-kubeconfig --name dev-us-east-2-beta --role-arn arn:aws:iam::369495373322:role/github-runner
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}

- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

promote-compatibility-test-snapshot:
runs-on: dev
deploy-proxy-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: prod-us-east-2-delta
- target_region: eu-central-1
target_cluster: prod-eu-central-1-gamma
- target_region: ap-southeast-1
target_cluster: prod-ap-southeast-1-epsilon
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0

- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}

- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

promote-compatibility-data:
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init

@@ -868,9 +974,24 @@ jobs:
BUCKET: neon-github-public-dev
PREFIX: artifacts/latest
run: |
# Update compatibility snapshot for the release
for build_type in debug release; do
OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst

time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
done

# Update Neon artifact for the release (reuse already uploaded artifact)
for build_type in debug release; do
OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst

S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo 2>&1 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi

time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
done
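The same "newest object for this filename" lookup appears in two hunks above; purely as an illustration, it could be factored into a helper like this (latest_s3_key is a made-up name; bucket, prefix, and filename follow the workflow's env).

```bash
# Sketch: resolve the most recent S3 key matching a filename under a prefix.
latest_s3_key() {
  local bucket="$1" prefix="$2" filename="$3"
  aws s3api list-objects-v2 --bucket "$bucket" --prefix "$prefix" \
    | jq -r '.Contents[].Key' \
    | grep "$filename" \
    | sort --version-sort \
    | tail -1 || true
}

S3_KEY=$(latest_s3_key neon-github-public-dev "artifacts/$GITHUB_RUN_ID" neon-Linux-release-artifact.tar.zst)
if [ -n "$S3_KEY" ]; then
  aws s3 cp --only-show-errors "s3://neon-github-public-dev/$S3_KEY" \
    "s3://neon-github-public-dev/artifacts/latest/neon-Linux-release-artifact.tar.zst"
fi
```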
.github/workflows/codestyle.yml (vendored, 4 lines changed)

@@ -106,7 +106,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git
target
key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust

- name: Run cargo clippy
run: ./run_clippy.sh

@@ -115,7 +115,7 @@ jobs:
run: cargo build --locked --all --all-targets

check-rust-dependencies:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
.gitmodules (vendored, 2 lines changed)

@@ -1,7 +1,7 @@
[submodule "vendor/postgres-v14"]
path = vendor/postgres-v14
url = https://github.com/neondatabase/postgres.git
branch = main
branch = REL_14_STABLE_neon
[submodule "vendor/postgres-v15"]
path = vendor/postgres-v15
url = https://github.com/neondatabase/postgres.git
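A sketch of how a local checkout picks up the retargeted submodule branch after this change; the commands are illustrative, not part of the diff.

```bash
# Sketch: refresh the v14 submodule against its new tracking branch.
git submodule sync vendor/postgres-v14
git submodule update --init --remote vendor/postgres-v14
# Confirm the tracked branch recorded in .gitmodules:
git config -f .gitmodules --get submodule.vendor/postgres-v14.branch   # REL_14_STABLE_neon
```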
CODEOWNERS (new file, 11 lines)

@@ -0,0 +1,11 @@
/compute_tools/ @neondatabase/control-plane
/control_plane/ @neondatabase/compute @neondatabase/storage
/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/pageserver/ @neondatabase/compute @neondatabase/storage
/pgxn/ @neondatabase/compute
/proxy/ @neondatabase/control-plane
/safekeeper/ @neondatabase/safekeepers
/vendor/ @neondatabase/compute
Cargo.lock (generated, 63 lines changed)

@@ -317,12 +317,6 @@ dependencies = [
"generic-array",
]

[[package]]
name = "boxfnonce"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426"

[[package]]
name = "bstr"
version = "1.0.1"

@@ -600,6 +594,7 @@ dependencies = [
"tar",
"thiserror",
"toml",
"url",
"utils",
"workspace_hack",
]

@@ -849,16 +844,6 @@ dependencies = [
"syn",
]

[[package]]
name = "daemonize"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70c24513e34f53b640819f0ac9f705b673fcf4006d7aab8778bee72ebfc89815"
dependencies = [
"boxfnonce",
"libc",
]

[[package]]
name = "darling"
version = "0.14.1"

@@ -2140,7 +2125,6 @@ dependencies = [
"crc32c",
"criterion",
"crossbeam-utils",
"daemonize",
"etcd_broker",
"fail",
"futures",

@@ -2161,6 +2145,7 @@ dependencies = [
"postgres-types",
"postgres_ffi",
"pprof",
"pq_proto",
"rand",
"regex",
"remote_storage",

@@ -2173,6 +2158,7 @@ dependencies = [
"svg_fmt",
"tar",
"tempfile",
"tenant_size_model",
"thiserror",
"tokio",
"tokio-postgres",

@@ -2190,6 +2176,7 @@ name = "pageserver_api"
version = "0.1.0"
dependencies = [
"anyhow",
"byteorder",
"bytes",
"const_format",
"postgres_ffi",

@@ -2268,6 +2255,14 @@ version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"

[[package]]
name = "persistent_range_query"
version = "0.1.0"
dependencies = [
"rand",
"workspace_hack",
]

[[package]]
name = "petgraph"
version = "0.6.2"

@@ -2452,6 +2447,21 @@ version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"

[[package]]
name = "pq_proto"
version = "0.1.0"
dependencies = [
"anyhow",
"bytes",
"pin-project-lite",
"postgres-protocol",
"rand",
"serde",
"tokio",
"tracing",
"workspace_hack",
]

[[package]]
name = "prettyplease"
version = "0.1.21"

@@ -2584,6 +2594,7 @@ dependencies = [
"once_cell",
"parking_lot 0.12.1",
"pin-project-lite",
"pq_proto",
"rand",
"rcgen",
"reqwest",

@@ -3087,7 +3098,6 @@ dependencies = [
"clap 4.0.15",
"const_format",
"crc32c",
"daemonize",
"etcd_broker",
"fs2",
"git-version",

@@ -3095,11 +3105,13 @@ dependencies = [
"humantime",
"hyper",
"metrics",
"nix 0.25.0",
"once_cell",
"parking_lot 0.12.1",
"postgres",
"postgres-protocol",
"postgres_ffi",
"pq_proto",
"regex",
"remote_storage",
"safekeeper_api",

@@ -3548,6 +3560,13 @@ dependencies = [
"winapi",
]

[[package]]
name = "tenant_size_model"
version = "0.1.0"
dependencies = [
"workspace_hack",
]

[[package]]
name = "termcolor"
version = "1.1.3"

@@ -4053,9 +4072,7 @@ dependencies = [
"metrics",
"nix 0.25.0",
"once_cell",
"pin-project-lite",
"postgres",
"postgres-protocol",
"pq_proto",
"rand",
"routerify",
"rustls",

@@ -4380,6 +4397,9 @@ dependencies = [
"crossbeam-utils",
"either",
"fail",
"futures-channel",
"futures-task",
"futures-util",
"hashbrown",
"indexmap",
"libc",

@@ -4393,6 +4413,7 @@ dependencies = [
"rand",
"regex",
"regex-syntax",
"reqwest",
"scopeguard",
"serde",
"stable_deref_trait",
@@ -25,6 +25,10 @@ members = [
# Besides, debug info should not affect the performance.
debug = true

# disable debug symbols for all packages except this one to decrease binaries size
[profile.release.package."*"]
debug = false

[profile.release-line-debug]
inherits = "release"
debug = 1 # true = 2 = all symbols, 1 = line only
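This hunk (apparently from the workspace Cargo.toml) introduces a release-line-debug profile; a hedged usage sketch:

```bash
# Sketch: build with the custom profile defined above
# (release settings plus line-number-only debug info).
cargo build --locked --profile release-line-debug
```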
@@ -13,7 +13,7 @@ ARG TAG=pinned
FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev

#########################################################################################
#

@@ -24,7 +24,7 @@ RUN apt update && \
FROM build-deps AS pg-build
COPY vendor/postgres-v14 postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers

@@ -13,7 +13,7 @@ ARG TAG=pinned
FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev

#########################################################################################
#

@@ -24,7 +24,7 @@ RUN apt update && \
FROM build-deps AS pg-build
COPY vendor/postgres-v15 postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
@@ -1,88 +0,0 @@
#
# Legacy version of the Dockerfile for the compute node.
# Used by e2e CI. Building Dockerfile.compute-node will take
# unreasonable ammount of time without v2 runners.
#
# TODO: remove once cloud repo CI is moved to v2 runners.
#

# Allow specifiyng different compute-tools tag and image repo, so we are
# able to use different images
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG IMAGE=compute-tools
ARG TAG=latest

#
# Image with pre-built tools
#
FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps
# Only to get ready compute_ctl binary as deppendency

#
# Image with Postgres build deps
#
FROM debian:bullseye-slim AS build-deps

RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev

#
# Image with built Postgres
#
FROM build-deps AS pg-build

# Add user postgres
RUN adduser postgres
RUN mkdir /pg && chown postgres:postgres /pg

# Copy source files
# version 14 is default for now
COPY ./vendor/postgres-v14 /pg/
COPY ./pgxn /pg/

# Build and install Postgres locally
RUN mkdir /pg/compute_build && cd /pg/compute_build && \
../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \
# Install main binaries and contribs
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install

# Install neon contrib
RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install

USER postgres
WORKDIR /pg

#
# Final compute node image to be exported
#
FROM debian:bullseye-slim

# libreadline-dev is required to run psql
RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev

# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
chown -R postgres:postgres /var/db/postgres && \
chmod 0750 /var/db/postgres/compute

# Copy ready Postgres binaries
COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local

# Copy binaries from compute-tools
COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl

# XXX: temporary symlink for compatibility with old control-plane
RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl

# Add postgres shared objects to the search path
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig

USER postgres

ENTRYPOINT ["/usr/local/bin/compute_ctl"]
38 Makefile
@@ -20,18 +20,18 @@ else
|
||||
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
|
||||
# Seccomp BPF is only available for Linux
|
||||
UNAME_S := $(shell uname -s)
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
# Seccomp BPF is only available for Linux
|
||||
PG_CONFIGURE_OPTS += --with-libseccomp
|
||||
endif
|
||||
|
||||
# macOS with brew-installed openssl requires explicit paths
|
||||
# It can be configured with OPENSSL_PREFIX variable
|
||||
UNAME_S := $(shell uname -s)
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
|
||||
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
|
||||
else ifeq ($(UNAME_S),Darwin)
|
||||
# macOS with brew-installed openssl requires explicit paths
|
||||
# It can be configured with OPENSSL_PREFIX variable
|
||||
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
|
||||
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
|
||||
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
|
||||
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
|
||||
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
|
||||
endif
|
||||
|
||||
# Use -C option so that when PostgreSQL "make install" installs the
|
||||
@@ -73,7 +73,8 @@ $(POSTGRES_INSTALL_DIR)/build/v14/config.status:
|
||||
+@echo "Configuring Postgres v14 build"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/v14 && \
|
||||
$(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure CFLAGS='$(PG_CFLAGS)' \
|
||||
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure \
|
||||
CFLAGS='$(PG_CFLAGS)' \
|
||||
$(PG_CONFIGURE_OPTS) \
|
||||
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log)
|
||||
|
||||
@@ -81,7 +82,8 @@ $(POSTGRES_INSTALL_DIR)/build/v15/config.status:
|
||||
+@echo "Configuring Postgres v15 build"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/v15 && \
|
||||
$(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure CFLAGS='$(PG_CFLAGS)' \
|
||||
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure \
|
||||
CFLAGS='$(PG_CFLAGS)' \
|
||||
$(PG_CONFIGURE_OPTS) \
|
||||
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log)
|
||||
|
||||
@@ -111,6 +113,8 @@ postgres-v14: postgres-v14-configure \
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install
|
||||
+@echo "Compiling libpq v14"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install
|
||||
+@echo "Compiling pg_prewarm v14"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_prewarm install
|
||||
+@echo "Compiling pg_buffercache v14"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install
|
||||
+@echo "Compiling pageinspect v14"
|
||||
@@ -123,6 +127,8 @@ postgres-v15: postgres-v15-configure \
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install
|
||||
+@echo "Compiling libpq v15"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install
|
||||
+@echo "Compiling pg_prewarm v15"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_prewarm install
|
||||
+@echo "Compiling pg_buffercache v15"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install
|
||||
+@echo "Compiling pageinspect v15"
|
||||
@@ -151,6 +157,11 @@ neon-pg-ext-v14: postgres-v14
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
|
||||
+@echo "Compiling neon_walredo v14"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
|
||||
+@echo "Compiling neon_test_utils" v14
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
|
||||
@@ -163,6 +174,11 @@ neon-pg-ext-v15: postgres-v15
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
|
||||
+@echo "Compiling neon_walredo v15"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
|
||||
+@echo "Compiling neon_test_utils" v15
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \
|
||||
|
||||
28 README.md
@@ -53,7 +53,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
1. Install XCode and dependencies
|
||||
```
|
||||
xcode-select --install
|
||||
brew install protobuf etcd openssl
|
||||
brew install protobuf etcd openssl flex bison
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
@@ -125,24 +125,23 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
|
||||
# Create repository in .neon with proper paths to binaries and data
|
||||
# Later that would be responsibility of a package install script
|
||||
> ./target/debug/neon_local init
|
||||
Starting pageserver at '127.0.0.1:64000' in '.neon'
|
||||
|
||||
Pageserver started
|
||||
Successfully initialized timeline 7dd0907914ac399ff3be45fb252bfdb7
|
||||
Stopping pageserver gracefully...done!
|
||||
Starting pageserver at '127.0.0.1:64000' in '.neon'.
|
||||
pageserver started, pid: 2545906
|
||||
Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9
|
||||
Stopped pageserver 1 process with pid 2545906
|
||||
|
||||
# start pageserver and safekeeper
|
||||
> ./target/debug/neon_local start
|
||||
Starting etcd broker using /usr/bin/etcd
|
||||
Starting pageserver at '127.0.0.1:64000' in '.neon'
|
||||
|
||||
Pageserver started
|
||||
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'
|
||||
Safekeeper started
|
||||
Starting etcd broker using "/usr/bin/etcd"
|
||||
etcd started, pid: 2545996
|
||||
Starting pageserver at '127.0.0.1:64000' in '.neon'.
|
||||
pageserver started, pid: 2546005
|
||||
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
|
||||
safekeeper 1 started, pid: 2546041
|
||||
|
||||
# start postgres compute node
|
||||
> ./target/debug/neon_local pg start main
|
||||
Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
||||
Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
||||
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
|
||||
Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
|
||||
|
||||
@@ -223,10 +222,7 @@ Ensure your dependencies are installed as described [here](https://github.com/ne
|
||||
```sh
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
|
||||
# either:
|
||||
CARGO_BUILD_FLAGS="--features=testing" make
|
||||
# or:
|
||||
make debug
|
||||
|
||||
./scripts/pytest
|
||||
```
|
||||
|
||||
188 cli-v2-story.md
@@ -1,188 +0,0 @@
|
||||
Create a new Zenith repository in the current directory:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli init
|
||||
The files belonging to this database system will be owned by user "heikki".
|
||||
This user must also own the server process.
|
||||
|
||||
The database cluster will be initialized with locale "en_GB.UTF-8".
|
||||
The default database encoding has accordingly been set to "UTF8".
|
||||
The default text search configuration will be set to "english".
|
||||
|
||||
Data page checksums are disabled.
|
||||
|
||||
creating directory tmp ... ok
|
||||
creating subdirectories ... ok
|
||||
selecting dynamic shared memory implementation ... posix
|
||||
selecting default max_connections ... 100
|
||||
selecting default shared_buffers ... 128MB
|
||||
selecting default time zone ... Europe/Helsinki
|
||||
creating configuration files ... ok
|
||||
running bootstrap script ... ok
|
||||
performing post-bootstrap initialization ... ok
|
||||
syncing data to disk ... ok
|
||||
|
||||
initdb: warning: enabling "trust" authentication for local connections
|
||||
You can change this by editing pg_hba.conf or using the option -A, or
|
||||
--auth-local and --auth-host, the next time you run initdb.
|
||||
new zenith repository was created in .zenith
|
||||
|
||||
Initially, there is only one branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
|
||||
main
|
||||
|
||||
Start a local Postgres instance on the branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start main
|
||||
Creating data directory from snapshot at 0/15FFB08...
|
||||
waiting for server to start....2021-04-13 09:27:43.919 EEST [984664] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
|
||||
2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv6 address "::1", port 5432
|
||||
2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv4 address "127.0.0.1", port 5432
|
||||
2021-04-13 09:27:43.927 EEST [984664] LOG: listening on Unix socket "/tmp/.s.PGSQL.5432"
|
||||
2021-04-13 09:27:43.939 EEST [984665] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
|
||||
2021-04-13 09:27:43.939 EEST [984665] LOG: creating missing WAL directory "pg_wal/archive_status"
|
||||
2021-04-13 09:27:44.189 EEST [984665] LOG: database system was not properly shut down; automatic recovery in progress
|
||||
2021-04-13 09:27:44.195 EEST [984665] LOG: invalid record length at 0/15FFB80: wanted 24, got 0
|
||||
2021-04-13 09:27:44.195 EEST [984665] LOG: redo is not required
|
||||
2021-04-13 09:27:44.225 EEST [984664] LOG: database system is ready to accept connections
|
||||
done
|
||||
server started
|
||||
|
||||
Run some commands against it:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "create table foo (t text);"
|
||||
CREATE TABLE
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "insert into foo values ('inserted on the main branch');"
|
||||
INSERT 0 1
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
(1 row)
|
||||
|
||||
Create a new branch called 'experimental'. We create it from the
|
||||
current end of the 'main' branch, but you could specify a different
|
||||
LSN as the start point instead.
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch experimental main
|
||||
branching at end of WAL: 0/161F478
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
|
||||
experimental
|
||||
main
|
||||
|
||||
Start another Postgres instance off the 'experimental' branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
|
||||
Creating data directory from snapshot at 0/15FFB08...
|
||||
waiting for server to start....2021-04-13 09:28:41.874 EEST [984766] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
|
||||
2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv6 address "::1", port 5433
|
||||
2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv4 address "127.0.0.1", port 5433
|
||||
2021-04-13 09:28:41.883 EEST [984766] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433"
|
||||
2021-04-13 09:28:41.896 EEST [984767] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
|
||||
2021-04-13 09:28:42.265 EEST [984767] LOG: database system was not properly shut down; automatic recovery in progress
|
||||
2021-04-13 09:28:42.269 EEST [984767] LOG: redo starts at 0/15FFB80
|
||||
2021-04-13 09:28:42.272 EEST [984767] LOG: invalid record length at 0/161F4B0: wanted 24, got 0
|
||||
2021-04-13 09:28:42.272 EEST [984767] LOG: redo done at 0/161F478 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
|
||||
2021-04-13 09:28:42.321 EEST [984766] LOG: database system is ready to accept connections
|
||||
done
|
||||
server started
|
||||
|
||||
Insert a row on the 'experimental' branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
(1 row)
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "insert into foo values ('inserted on experimental')"
|
||||
INSERT 0 1
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
inserted on experimental
|
||||
(2 rows)
|
||||
|
||||
See that the other Postgres instance is still running on 'main' branch on port 5432:
|
||||
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5432 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
(1 row)
|
||||
|
||||
|
||||
|
||||
|
||||
Everything is stored in the .zenith directory:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/
|
||||
total 12
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 datadirs
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 refs
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 timelines
|
||||
|
||||
The 'datadirs' directory contains the datadirs of the running instances:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/
|
||||
total 8
|
||||
drwx------ 18 heikki heikki 4096 Apr 13 09:27 3c0c634c1674079b2c6d4edf7c91523e
|
||||
drwx------ 18 heikki heikki 4096 Apr 13 09:28 697e3c103d4b1763cd6e82e4ff361d76
|
||||
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/3c0c634c1674079b2c6d4edf7c91523e/
|
||||
total 124
|
||||
drwxr-xr-x 5 heikki heikki 4096 Apr 13 09:27 base
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 global
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_commit_ts
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_dynshmem
|
||||
-rw------- 1 heikki heikki 4760 Apr 13 09:27 pg_hba.conf
|
||||
-rw------- 1 heikki heikki 1636 Apr 13 09:27 pg_ident.conf
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:32 pg_logical
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 pg_multixact
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_notify
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_replslot
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_serial
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_snapshots
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_stat
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:34 pg_stat_tmp
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_subtrans
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_tblspc
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_twophase
|
||||
-rw------- 1 heikki heikki 3 Apr 13 09:27 PG_VERSION
|
||||
lrwxrwxrwx 1 heikki heikki 52 Apr 13 09:27 pg_wal -> ../../timelines/3c0c634c1674079b2c6d4edf7c91523e/wal
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_xact
|
||||
-rw------- 1 heikki heikki 88 Apr 13 09:27 postgresql.auto.conf
|
||||
-rw------- 1 heikki heikki 28688 Apr 13 09:27 postgresql.conf
|
||||
-rw------- 1 heikki heikki 96 Apr 13 09:27 postmaster.opts
|
||||
-rw------- 1 heikki heikki 149 Apr 13 09:27 postmaster.pid
|
||||
|
||||
Note how 'pg_wal' is just a symlink to the 'timelines' directory. The
|
||||
datadir is ephemeral, you can delete it at any time, and it can be reconstructed
|
||||
from the snapshots and WAL stored in the 'timelines' directory. So if you push/pull
|
||||
the repository, the 'datadirs' are not included. (They are like git working trees)
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ killall -9 postgres
|
||||
~/git-sandbox/zenith (cli-v2)$ rm -rf .zenith/datadirs/*
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
|
||||
Creating data directory from snapshot at 0/15FFB08...
|
||||
waiting for server to start....2021-04-13 09:37:05.476 EEST [985340] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
|
||||
2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv6 address "::1", port 5433
|
||||
2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv4 address "127.0.0.1", port 5433
|
||||
2021-04-13 09:37:05.487 EEST [985340] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433"
|
||||
2021-04-13 09:37:05.498 EEST [985341] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
|
||||
2021-04-13 09:37:05.808 EEST [985341] LOG: database system was not properly shut down; automatic recovery in progress
|
||||
2021-04-13 09:37:05.813 EEST [985341] LOG: redo starts at 0/15FFB80
|
||||
2021-04-13 09:37:05.815 EEST [985341] LOG: invalid record length at 0/161F770: wanted 24, got 0
|
||||
2021-04-13 09:37:05.815 EEST [985341] LOG: redo done at 0/161F738 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
|
||||
2021-04-13 09:37:05.866 EEST [985340] LOG: database system is ready to accept connections
|
||||
done
|
||||
server started
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
inserted on experimental
|
||||
(2 rows)
|
||||
|
||||
@@ -65,7 +65,7 @@ impl GenericOption {
|
||||
let name = match self.name.as_str() {
|
||||
"safekeepers" => "neon.safekeepers",
|
||||
"wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
|
||||
"wal_acceptor_connect_timeout" => "neon.safekeeper_connect_timeout",
|
||||
"wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout",
|
||||
it => it,
|
||||
};
|
||||
|
||||
|
||||
@@ -4,20 +4,21 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
clap = "4.0"
|
||||
comfy-table = "6.1"
|
||||
git-version = "0.3.5"
|
||||
tar = "0.4.38"
|
||||
nix = "0.25"
|
||||
once_cell = "1.13.0"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
regex = "1"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "2.0"
|
||||
toml = "0.5"
|
||||
once_cell = "1.13.0"
|
||||
regex = "1"
|
||||
anyhow = "1.0"
|
||||
tar = "0.4.38"
|
||||
thiserror = "1"
|
||||
nix = "0.25"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
toml = "0.5"
|
||||
url = "2.2.2"
|
||||
|
||||
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
|
||||
# instead, so that recompile times are better.
|
||||
|
||||
279 control_plane/src/background_process.rs Normal file
@@ -0,0 +1,279 @@
//! Spawns and kills background processes that are needed by Neon CLI.
//! Applies common set-up such as log and pid files (if needed) to every process.
//!
//! Neon CLI does not run in the background, so it needs to store the information about
//! spawned processes, which it does in this module.
//! We do that by storing the pid of the process in the "${process_name}.pid" file.
//! The pid file can be created by the process itself
//! (Neon storage binaries do that and also ensure that a lock is taken onto that file)
//! or we create such a file after starting the process
//! (non-Neon binaries don't necessarily follow our pidfile conventions).
//! The pid stored in the file is later used to stop the service.
//!
//! See [`lock_file`] module for more info.

use std::ffi::OsStr;
use std::io::Write;
use std::path::Path;
use std::process::{Child, Command};
use std::time::Duration;
use std::{fs, io, thread};

use anyhow::{anyhow, bail, Context, Result};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;

use utils::lock_file;

// These constants control the loop used to poll for process start / stop.
//
// The loop waits for at most 10 seconds, polling every 100 ms.
// Once a second, it prints a dot ("."), to give the user an indication that
// it's waiting. If the process hasn't started/stopped after 5 seconds,
// it prints a notice that it's taking long, but keeps waiting.
//
const RETRY_UNTIL_SECS: u64 = 10;
const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
const RETRY_INTERVAL_MILLIS: u64 = 100;
const DOT_EVERY_RETRIES: u64 = 10;
const NOTICE_AFTER_RETRIES: u64 = 50;

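// Worked out from the constants above: RETRIES = (RETRY_UNTIL_SECS * 1000) /
// RETRY_INTERVAL_MILLIS = 10_000 / 100 = 100 polls; a dot every DOT_EVERY_RETRIES = 10
// polls is once per second, and the "taking long" notice after NOTICE_AFTER_RETRIES = 50
// polls fires at the 5 second mark.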
/// Argument to `start_process`, to indicate whether it should create the pidfile or if the process creates
|
||||
/// it itself.
|
||||
pub enum InitialPidFile<'t> {
|
||||
/// Create a pidfile, to allow future CLI invocations to manipulate the process.
|
||||
Create(&'t Path),
|
||||
/// The process will create the pidfile itself, need to wait for that event.
|
||||
Expect(&'t Path),
|
||||
}
|
||||
|
||||
/// Start a background child process using the parameters given.
|
||||
pub fn start_process<F, S: AsRef<OsStr>>(
|
||||
process_name: &str,
|
||||
datadir: &Path,
|
||||
command: &Path,
|
||||
args: &[S],
|
||||
initial_pid_file: InitialPidFile,
|
||||
process_status_check: F,
|
||||
) -> anyhow::Result<Child>
|
||||
where
|
||||
F: Fn() -> anyhow::Result<bool>,
|
||||
{
|
||||
let log_path = datadir.join(format!("{process_name}.log"));
|
||||
let process_log_file = fs::OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
.append(true)
|
||||
.open(&log_path)
|
||||
.with_context(|| {
|
||||
format!("Could not open {process_name} log file {log_path:?} for writing")
|
||||
})?;
|
||||
let same_file_for_stderr = process_log_file.try_clone().with_context(|| {
|
||||
format!("Could not reuse {process_name} log file {log_path:?} for writing stderr")
|
||||
})?;
|
||||
|
||||
let mut command = Command::new(command);
|
||||
let background_command = command
|
||||
.stdout(process_log_file)
|
||||
.stderr(same_file_for_stderr)
|
||||
.args(args);
|
||||
let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
|
||||
|
||||
let mut spawned_process = filled_cmd.spawn().with_context(|| {
|
||||
format!("Could not spawn {process_name}, see console output and log files for details.")
|
||||
})?;
|
||||
let pid = spawned_process.id();
|
||||
let pid = Pid::from_raw(
|
||||
i32::try_from(pid)
|
||||
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
|
||||
);
|
||||
|
||||
let pid_file_to_check = match initial_pid_file {
|
||||
InitialPidFile::Create(target_pid_file_path) => {
|
||||
match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
|
||||
lock_file::LockCreationResult::Created { .. } => {
|
||||
// We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
|
||||
// as this CLI invocation exits, so it's a bit useless, but it doesn't do any harm either.
|
||||
}
|
||||
lock_file::LockCreationResult::AlreadyLocked { .. } => {
|
||||
anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
|
||||
}
|
||||
lock_file::LockCreationResult::CreationFailed(e) => {
|
||||
return Err(e.context(format!(
|
||||
"Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
|
||||
)))
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
|
||||
};
|
||||
|
||||
for retries in 0..RETRIES {
|
||||
match process_started(pid, pid_file_to_check, &process_status_check) {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} started, pid: {pid}");
|
||||
return Ok(spawned_process);
|
||||
}
|
||||
Ok(false) => {
|
||||
if retries == NOTICE_AFTER_RETRIES {
|
||||
// The process is taking a long time to start up. Keep waiting, but
|
||||
// print a message
|
||||
print!("\n{process_name} has not started yet, continuing to wait");
|
||||
}
|
||||
if retries % DOT_EVERY_RETRIES == 0 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
|
||||
}
|
||||
Err(e) => {
|
||||
println!("{process_name} failed to start: {e:#}");
|
||||
if let Err(e) = spawned_process.kill() {
|
||||
println!("Could not stop {process_name} subprocess: {e:#}")
|
||||
};
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
println!();
|
||||
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
|
||||
}
|
||||
|
||||
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
|
||||
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
|
||||
if !pid_file.exists() {
|
||||
println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
|
||||
return Ok(());
|
||||
}
|
||||
let pid = read_pidfile(pid_file)?;
|
||||
|
||||
let sig = if immediate {
|
||||
print!("Stopping {process_name} with pid {pid} immediately..");
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
print!("Stopping {process_name} with pid {pid} gracefully..");
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(()) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!(
|
||||
"{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
Err(e) => anyhow::bail!("Failed to send signal to {process_name} with pid {pid}: {e}"),
|
||||
}
|
||||
|
||||
// Wait until process is gone
|
||||
for retries in 0..RETRIES {
|
||||
match process_has_stopped(pid) {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} stopped");
|
||||
if let Err(e) = fs::remove_file(pid_file) {
|
||||
if e.kind() != io::ErrorKind::NotFound {
|
||||
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
Ok(false) => {
|
||||
if retries == NOTICE_AFTER_RETRIES {
|
||||
// The process is taking a long time to start up. Keep waiting, but
|
||||
// print a message
|
||||
print!("\n{process_name} has not stopped yet, continuing to wait");
|
||||
}
|
||||
if retries % DOT_EVERY_RETRIES == 0 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
|
||||
}
|
||||
Err(e) => {
|
||||
println!("{process_name} with pid {pid} failed to stop: {e:#}");
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
println!();
|
||||
anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
|
||||
}
|
||||
|
||||
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
|
||||
let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
|
||||
|
||||
// Pass through these environment variables to the command
|
||||
for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] {
|
||||
if let Some(val) = std::env::var_os(var) {
|
||||
filled_cmd = filled_cmd.env(var, val);
|
||||
}
|
||||
}
|
||||
|
||||
filled_cmd
|
||||
}
|
||||
|
||||
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
for env_key in [
|
||||
"AWS_ACCESS_KEY_ID",
|
||||
"AWS_SECRET_ACCESS_KEY",
|
||||
"AWS_SESSION_TOKEN",
|
||||
] {
|
||||
if let Ok(value) = std::env::var(env_key) {
|
||||
cmd = cmd.env(env_key, value);
|
||||
}
|
||||
}
|
||||
cmd
|
||||
}
|
||||
|
||||
fn process_started<F>(
|
||||
pid: Pid,
|
||||
pid_file_to_check: Option<&Path>,
|
||||
status_check: &F,
|
||||
) -> anyhow::Result<bool>
|
||||
where
|
||||
F: Fn() -> anyhow::Result<bool>,
|
||||
{
|
||||
match status_check() {
|
||||
Ok(true) => match pid_file_to_check {
|
||||
Some(pid_file_path) => {
|
||||
if pid_file_path.exists() {
|
||||
let pid_in_file = read_pidfile(pid_file_path)?;
|
||||
Ok(pid_in_file == pid)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
None => Ok(true),
|
||||
},
|
||||
Ok(false) => Ok(false),
|
||||
Err(e) => anyhow::bail!("process failed to start: {e}"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Read a PID file
|
||||
///
|
||||
/// We expect a file that contains a single integer.
|
||||
fn read_pidfile(pidfile: &Path) -> Result<Pid> {
|
||||
let pid_str = fs::read_to_string(pidfile)
|
||||
.with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
|
||||
let pid: i32 = pid_str
|
||||
.parse()
|
||||
.map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
|
||||
if pid < 1 {
|
||||
bail!("pidfile {pidfile:?} contained bad value '{pid}'");
|
||||
}
|
||||
Ok(Pid::from_raw(pid))
|
||||
}
|
||||
|
||||
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
|
||||
match kill(pid, None) {
|
||||
// Process exists, keep waiting
|
||||
Ok(_) => Ok(false),
|
||||
// Process not found, we're done
|
||||
Err(Errno::ESRCH) => Ok(true),
|
||||
Err(err) => anyhow::bail!("Failed to send signal to process with pid {pid}: {err}"),
|
||||
}
|
||||
}
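
The module above is what the etcd, pageserver, and safekeeper wrappers further down switch to. Purely for orientation, here is a minimal, hypothetical sketch (not part of this commit) of how another module inside the `control_plane` crate might drive `start_process` and `stop_process`; the service name, binary path, and the always-true status check are invented for illustration.

// Hypothetical usage sketch for background_process (not part of this commit).
use std::path::Path;

use crate::background_process::{self, InitialPidFile};

fn run_demo_service(datadir: &Path) -> anyhow::Result<()> {
    // start_process() writes the child's stdout/stderr to <datadir>/demo_service.log.
    std::fs::create_dir_all(datadir)?;
    let pid_file = datadir.join("demo_service.pid");

    // Spawn the binary and poll the status check until it reports readiness;
    // InitialPidFile::Create makes this helper write the pid file itself.
    let _child = background_process::start_process(
        "demo_service",
        datadir,
        Path::new("/usr/bin/sleep"), // stand-in for a real service binary
        &["3600"],
        InitialPidFile::Create(pid_file.as_path()),
        || Ok(true), // real callers probe an HTTP or TCP endpoint here
    )?;

    // Later: graceful stop via SIGTERM; passing `true` would send SIGQUIT instead.
    background_process::stop_process(false, "demo_service", &pid_file)
}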
@@ -9,8 +9,8 @@ use anyhow::{anyhow, bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
|
||||
use control_plane::compute::ComputeControlPlane;
|
||||
use control_plane::local_env::{EtcdBroker, LocalEnv};
|
||||
use control_plane::pageserver::PageServerNode;
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage::PageServerNode;
|
||||
use control_plane::{etcd, local_env};
|
||||
use pageserver_api::models::TimelineInfo;
|
||||
use pageserver_api::{
|
||||
|
||||
@@ -12,15 +12,14 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use utils::{
|
||||
connstring::connection_host_port,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
|
||||
use crate::pageserver::PageServerNode;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
use crate::storage::PageServerNode;
|
||||
|
||||
//
|
||||
// ComputeControlPlane
|
||||
@@ -300,7 +299,8 @@ impl PostgresNode {
|
||||
|
||||
// Configure the node to fetch pages from pageserver
|
||||
let pageserver_connstr = {
|
||||
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
|
||||
let config = &self.pageserver.pg_connection_config;
|
||||
let (host, port) = (config.host(), config.port());
|
||||
|
||||
// Set up authentication
|
||||
//
|
||||
@@ -343,7 +343,7 @@ impl PostgresNode {
|
||||
// To be able to restore database in case of pageserver node crash, safekeeper should not
|
||||
// remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
|
||||
// (if they are not able to upload WAL to S3).
|
||||
conf.append("max_replication_write_lag", "500MB");
|
||||
conf.append("max_replication_write_lag", "15MB");
|
||||
conf.append("max_replication_flush_lag", "10GB");
|
||||
|
||||
if !self.env.safekeepers.is_empty() {
|
||||
|
||||
57 control_plane/src/connection.rs Normal file
@@ -0,0 +1,57 @@
|
||||
use url::Url;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PgConnectionConfig {
|
||||
url: Url,
|
||||
}
|
||||
|
||||
impl PgConnectionConfig {
|
||||
pub fn host(&self) -> &str {
|
||||
self.url.host_str().expect("BUG: no host")
|
||||
}
|
||||
|
||||
pub fn port(&self) -> u16 {
|
||||
self.url.port().expect("BUG: no port")
|
||||
}
|
||||
|
||||
/// Return a `<host>:<port>` string.
|
||||
pub fn raw_address(&self) -> String {
|
||||
format!("{}:{}", self.host(), self.port())
|
||||
}
|
||||
|
||||
/// Connect using postgres protocol with TLS disabled.
|
||||
pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
|
||||
postgres::Client::connect(self.url.as_str(), postgres::NoTls)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for PgConnectionConfig {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let mut url: Url = s.parse()?;
|
||||
|
||||
match url.scheme() {
|
||||
"postgres" | "postgresql" => {}
|
||||
other => anyhow::bail!("invalid scheme: {other}"),
|
||||
}
|
||||
|
||||
// It's not a valid connection url if host is unavailable.
|
||||
if url.host().is_none() {
|
||||
anyhow::bail!(url::ParseError::EmptyHost);
|
||||
}
|
||||
|
||||
// E.g. `postgres:bar`.
|
||||
if url.cannot_be_a_base() {
|
||||
anyhow::bail!("URL cannot be a base");
|
||||
}
|
||||
|
||||
// Set the default PG port if it's missing.
|
||||
if url.port().is_none() {
|
||||
url.set_port(Some(5432))
|
||||
.expect("BUG: couldn't set the default port");
|
||||
}
|
||||
|
||||
Ok(Self { url })
|
||||
}
|
||||
}
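
To make the parsing rules above concrete, here is a small, hypothetical sketch (not part of the commit) of how the new type could be exercised from elsewhere in the crate; the connection string and the asserted values are invented for the example.

// Hypothetical usage sketch for PgConnectionConfig (not part of this commit).
use crate::connection::PgConnectionConfig;

fn demo_parse() -> anyhow::Result<()> {
    // The default Postgres port 5432 is filled in when the URL omits it.
    let config: PgConnectionConfig = "postgresql://no_user@127.0.0.1/no_db".parse()?;
    assert_eq!(config.host(), "127.0.0.1");
    assert_eq!(config.port(), 5432);
    assert_eq!(config.raw_address(), "127.0.0.1:5432");

    // Non-postgres schemes (and URLs without a host) are rejected.
    assert!("http://127.0.0.1:5432/no_db".parse::<PgConnectionConfig>().is_err());
    Ok(())
}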
@@ -1,99 +1,75 @@
|
||||
use std::{
|
||||
fs,
|
||||
path::PathBuf,
|
||||
process::{Command, Stdio},
|
||||
};
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
use anyhow::Context;
|
||||
use nix::{
|
||||
sys::signal::{kill, Signal},
|
||||
unistd::Pid,
|
||||
};
|
||||
|
||||
use crate::{local_env, read_pidfile};
|
||||
use crate::{background_process, local_env};
|
||||
|
||||
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let etcd_broker = &env.etcd_broker;
|
||||
println!(
|
||||
"Starting etcd broker using {}",
|
||||
etcd_broker.etcd_binary_path.display()
|
||||
print!(
|
||||
"Starting etcd broker using {:?}",
|
||||
etcd_broker.etcd_binary_path
|
||||
);
|
||||
|
||||
let etcd_data_dir = env.base_data_dir.join("etcd");
|
||||
fs::create_dir_all(&etcd_data_dir).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd data dir: {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
fs::create_dir_all(&etcd_data_dir)
|
||||
.with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?;
|
||||
|
||||
let etcd_stdout_file =
|
||||
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd stout file in directory {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
let etcd_stderr_file =
|
||||
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd stderr file in directory {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
let client_urls = etcd_broker.comma_separated_endpoints();
|
||||
let args = [
|
||||
format!("--data-dir={}", etcd_data_dir.display()),
|
||||
format!("--listen-client-urls={client_urls}"),
|
||||
format!("--advertise-client-urls={client_urls}"),
|
||||
// Set --quota-backend-bytes to keep the etcd virtual memory
|
||||
// size smaller. Our test etcd clusters are very small.
|
||||
// See https://github.com/etcd-io/etcd/issues/7910
|
||||
"--quota-backend-bytes=100000000".to_string(),
|
||||
// etcd doesn't compact (vacuum) with default settings,
|
||||
// enable it to prevent space exhaustion.
|
||||
"--auto-compaction-mode=revision".to_string(),
|
||||
"--auto-compaction-retention=1".to_string(),
|
||||
];
|
||||
|
||||
let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
|
||||
.args(&[
|
||||
format!("--data-dir={}", etcd_data_dir.display()),
|
||||
format!("--listen-client-urls={client_urls}"),
|
||||
format!("--advertise-client-urls={client_urls}"),
|
||||
// Set --quota-backend-bytes to keep the etcd virtual memory
|
||||
// size smaller. Our test etcd clusters are very small.
|
||||
// See https://github.com/etcd-io/etcd/issues/7910
|
||||
"--quota-backend-bytes=100000000".to_string(),
|
||||
// etcd doesn't compact (vacuum) with default settings,
|
||||
// enable it to prevent space exhaustion.
|
||||
"--auto-compaction-mode=revision".to_string(),
|
||||
"--auto-compaction-retention=1".to_string(),
|
||||
])
|
||||
.stdout(Stdio::from(etcd_stdout_file))
|
||||
.stderr(Stdio::from(etcd_stderr_file))
|
||||
.spawn()
|
||||
.context("Failed to spawn etcd subprocess")?;
|
||||
let pid = etcd_process.id();
|
||||
let pid_file_path = etcd_pid_file_path(env);
|
||||
|
||||
let etcd_pid_file_path = etcd_pid_file_path(env);
|
||||
fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd pid file at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?;
|
||||
let client = reqwest::blocking::Client::new();
|
||||
|
||||
background_process::start_process(
|
||||
"etcd",
|
||||
&etcd_data_dir,
|
||||
&etcd_broker.etcd_binary_path,
|
||||
&args,
|
||||
background_process::InitialPidFile::Create(&pid_file_path),
|
||||
|| {
|
||||
for broker_endpoint in &etcd_broker.broker_endpoints {
|
||||
let request = broker_endpoint
|
||||
.join("health")
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to append /health path to broker endopint {}",
|
||||
broker_endpoint
|
||||
)
|
||||
})
|
||||
.and_then(|url| {
|
||||
client.get(&url.to_string()).build().with_context(|| {
|
||||
format!("Failed to construct request to etcd endpoint {url}")
|
||||
})
|
||||
})?;
|
||||
if client.execute(request).is_ok() {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(false)
|
||||
},
|
||||
)
|
||||
.context("Failed to spawn etcd subprocess")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let etcd_path = &env.etcd_broker.etcd_binary_path;
|
||||
println!("Stopping etcd broker at {}", etcd_path.display());
|
||||
|
||||
let etcd_pid_file_path = etcd_pid_file_path(env);
|
||||
let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read etcd pid file at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?);
|
||||
|
||||
kill(pid, Signal::SIGTERM).with_context(|| {
|
||||
format!(
|
||||
"Failed to stop etcd with pid {pid} at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
background_process::stop_process(true, "etcd", &etcd_pid_file_path(env))
|
||||
}
|
||||
|
||||
fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
|
||||
|
||||
@@ -6,59 +6,12 @@
|
||||
// Intended to be used in integration tests and in CLI tools for
|
||||
// local installations.
|
||||
//
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
|
||||
mod background_process;
|
||||
pub mod compute;
|
||||
pub mod connection;
|
||||
pub mod etcd;
|
||||
pub mod local_env;
|
||||
pub mod pageserver;
|
||||
pub mod postgresql_conf;
|
||||
pub mod safekeeper;
|
||||
pub mod storage;
|
||||
|
||||
/// Read a PID file
|
||||
///
|
||||
/// We expect a file that contains a single integer.
|
||||
/// We return an i32 for compatibility with libc and nix.
|
||||
pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
|
||||
let pid_str = fs::read_to_string(pidfile)
|
||||
.with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
|
||||
let pid: i32 = pid_str
|
||||
.parse()
|
||||
.map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
|
||||
if pid < 1 {
|
||||
bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
|
||||
}
|
||||
Ok(pid)
|
||||
}
|
||||
|
||||
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
|
||||
let cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
|
||||
|
||||
let var = "LLVM_PROFILE_FILE";
|
||||
if let Some(val) = std::env::var_os(var) {
|
||||
cmd.env(var, val);
|
||||
}
|
||||
|
||||
const RUST_LOG_KEY: &str = "RUST_LOG";
|
||||
if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
|
||||
cmd.env(RUST_LOG_KEY, rust_log_value)
|
||||
} else {
|
||||
cmd
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
for env_key in [
|
||||
"AWS_ACCESS_KEY_ID",
|
||||
"AWS_SECRET_ACCESS_KEY",
|
||||
"AWS_SESSION_TOKEN",
|
||||
] {
|
||||
if let Ok(value) = std::env::var(env_key) {
|
||||
cmd = cmd.env(env_key, value);
|
||||
}
|
||||
}
|
||||
cmd
|
||||
}
|
||||
|
||||
@@ -226,12 +226,12 @@ impl LocalEnv {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pageserver_bin(&self) -> anyhow::Result<PathBuf> {
|
||||
Ok(self.neon_distrib_dir.join("pageserver"))
|
||||
pub fn pageserver_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("pageserver")
|
||||
}
|
||||
|
||||
pub fn safekeeper_bin(&self) -> anyhow::Result<PathBuf> {
|
||||
Ok(self.neon_distrib_dir.join("safekeeper"))
|
||||
pub fn safekeeper_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("safekeeper")
|
||||
}
|
||||
|
||||
pub fn pg_data_dirs_path(&self) -> PathBuf {
|
||||
|
||||
@@ -1,33 +1,27 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{BufReader, Write};
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::time::Duration;
|
||||
use std::{io, result, thread};
|
||||
use std::process::Child;
|
||||
use std::{io, result};
|
||||
|
||||
use crate::connection::PgConnectionConfig;
|
||||
use anyhow::{bail, Context};
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use pageserver_api::models::{
|
||||
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use postgres::{Config, NoTls};
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
http::error::HttpErrorBody,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum PageserverHttpError {
|
||||
@@ -75,7 +69,7 @@ impl ResponseErrorMessageExt for Response {
|
||||
//
|
||||
#[derive(Debug)]
|
||||
pub struct PageServerNode {
|
||||
pub pg_connection_config: Config,
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: Client,
|
||||
pub http_base_url: String,
|
||||
@@ -101,7 +95,7 @@ impl PageServerNode {
|
||||
}
|
||||
|
||||
/// Construct libpq connection string for connecting to the pageserver.
|
||||
fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config {
|
||||
fn pageserver_connection_config(password: &str, listen_addr: &str) -> PgConnectionConfig {
|
||||
format!("postgresql://no_user:{password}@{listen_addr}/no_db")
|
||||
.parse()
|
||||
.unwrap()
|
||||
@@ -161,7 +155,15 @@ impl PageServerNode {
|
||||
init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'");
|
||||
}
|
||||
|
||||
self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?;
|
||||
let mut pageserver_process = self
|
||||
.start_node(&init_config_overrides, &self.env.base_data_dir, true)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to start a process for pageserver {}",
|
||||
self.env.pageserver.id,
|
||||
)
|
||||
})?;
|
||||
|
||||
let init_result = self
|
||||
.try_init_timeline(create_tenant, initial_timeline_id, pg_version)
|
||||
.context("Failed to create initial tenant and timeline for pageserver");
|
||||
@@ -171,7 +173,29 @@ impl PageServerNode {
|
||||
}
|
||||
Err(e) => eprintln!("{e:#}"),
|
||||
}
|
||||
self.stop(false)?;
|
||||
match pageserver_process.kill() {
|
||||
Err(e) => {
|
||||
eprintln!(
|
||||
"Failed to stop pageserver {} process with pid {}: {e:#}",
|
||||
self.env.pageserver.id,
|
||||
pageserver_process.id(),
|
||||
)
|
||||
}
|
||||
Ok(()) => {
|
||||
println!(
|
||||
"Stopped pageserver {} process with pid {}",
|
||||
self.env.pageserver.id,
|
||||
pageserver_process.id(),
|
||||
);
|
||||
// cleanup after pageserver startup, since we do not call regular `stop_process` during init
|
||||
let pid_file = self.pid_file();
|
||||
if let Err(e) = fs::remove_file(&pid_file) {
|
||||
if e.kind() != io::ErrorKind::NotFound {
|
||||
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
init_result
|
||||
}
|
||||
|
||||
@@ -196,11 +220,14 @@ impl PageServerNode {
|
||||
self.env.pageserver_data_dir()
|
||||
}
|
||||
|
||||
pub fn pid_file(&self) -> PathBuf {
|
||||
/// The pid file is created by the pageserver process, with its pid stored inside.
|
||||
/// Other pageservers cannot lock the same file and overwrite it for as long as the current
|
||||
/// pageserver runs. (Unless someone removes the file manually; never do that!)
|
||||
fn pid_file(&self) -> PathBuf {
|
||||
self.repo_path().join("pageserver.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
|
||||
self.start_node(config_overrides, &self.repo_path(), false)
|
||||
}
|
||||
|
||||
@@ -209,10 +236,10 @@ impl PageServerNode {
|
||||
config_overrides: &[&str],
|
||||
datadir: &Path,
|
||||
update_config: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
println!(
|
||||
) -> anyhow::Result<Child> {
|
||||
print!(
|
||||
"Starting pageserver at '{}' in '{}'",
|
||||
connection_address(&self.pg_connection_config),
|
||||
self.pg_connection_config.raw_address(),
|
||||
datadir.display()
|
||||
);
|
||||
io::stdout().flush()?;
|
||||
@@ -220,10 +247,7 @@ impl PageServerNode {
|
||||
let mut args = vec![
|
||||
"-D",
|
||||
datadir.to_str().with_context(|| {
|
||||
format!(
|
||||
"Datadir path '{}' cannot be represented as a unicode string",
|
||||
datadir.display()
|
||||
)
|
||||
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
];
|
||||
|
||||
@@ -235,48 +259,18 @@ impl PageServerNode {
|
||||
args.extend(["-c", config_override]);
|
||||
}
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
|
||||
filled_cmd = fill_aws_secrets_vars(filled_cmd);
|
||||
|
||||
if !filled_cmd.status()?.success() {
|
||||
bail!(
|
||||
"Pageserver failed to start. See console output and '{}' for details.",
|
||||
datadir.join("pageserver.log").display()
|
||||
);
|
||||
}
|
||||
|
||||
// It takes a while for the page server to start up. Wait until it is
|
||||
// open for business.
|
||||
const RETRIES: i8 = 15;
|
||||
for retries in 1..RETRIES {
|
||||
match self.check_status() {
|
||||
Ok(()) => {
|
||||
println!("\nPageserver started");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => {
|
||||
match err {
|
||||
PageserverHttpError::Transport(err) => {
|
||||
if err.is_connect() && retries < 5 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
} else {
|
||||
if retries == 5 {
|
||||
println!() // put a line break after dots for second message
|
||||
}
|
||||
println!("Pageserver not responding yet, err {err} retrying ({retries})...");
|
||||
}
|
||||
}
|
||||
PageserverHttpError::Response(msg) => {
|
||||
bail!("pageserver failed to start: {msg} ")
|
||||
}
|
||||
}
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("pageserver failed to start in {RETRIES} seconds");
|
||||
background_process::start_process(
|
||||
"pageserver",
|
||||
datadir,
|
||||
&self.env.pageserver_bin(),
|
||||
&args,
|
||||
background_process::InitialPidFile::Expect(&self.pid_file()),
|
||||
|| match self.check_status() {
|
||||
Ok(()) => Ok(true),
|
||||
Err(PageserverHttpError::Transport(_)) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -288,69 +282,18 @@ impl PageServerNode {
|
||||
/// If the server is not running, returns success
|
||||
///
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
let pid_file = self.pid_file();
|
||||
if !pid_file.exists() {
|
||||
println!("Pageserver is already stopped");
|
||||
return Ok(());
|
||||
}
|
||||
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
|
||||
|
||||
let sig = if immediate {
|
||||
print!("Stopping pageserver immediately..");
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
print!("Stopping pageserver gracefully..");
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!("Pageserver with pid {pid} does not exist, but a PID file was found");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to pageserver with pid {pid}: {}",
|
||||
err.desc()
|
||||
),
|
||||
}
|
||||
|
||||
// Wait until process is gone
|
||||
for i in 0..600 {
|
||||
let signal = None; // Send no signal, just get the error code
|
||||
match kill(pid, signal) {
|
||||
Ok(_) => (), // Process exists, keep waiting
|
||||
Err(Errno::ESRCH) => {
|
||||
// Process not found, we're done
|
||||
println!("done!");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to pageserver with pid {}: {}",
|
||||
pid,
|
||||
err.desc()
|
||||
),
|
||||
};
|
||||
|
||||
if i % 10 == 0 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
|
||||
bail!("Failed to stop pageserver with pid {pid}");
|
||||
background_process::stop_process(immediate, "pageserver", &self.pid_file())
|
||||
}
|
||||
|
||||
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
|
||||
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
|
||||
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
|
||||
|
||||
println!("Pageserver query: '{sql}'");
|
||||
client.simple_query(sql).unwrap()
|
||||
}
|
||||
|
||||
pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
|
||||
self.pg_connection_config.connect(NoTls)
|
||||
self.pg_connection_config.connect_no_tls()
|
||||
}
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
@@ -419,6 +362,11 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<NonZeroU64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
||||
trace_read_requests: settings
|
||||
.remove("trace_read_requests")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
@@ -481,6 +429,11 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<NonZeroU64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
||||
trace_read_requests: settings
|
||||
.get("trace_read_requests")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
@@ -549,7 +502,7 @@ impl PageServerNode {
|
||||
pg_wal: Option<(Lsn, PathBuf)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
|
||||
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
|
||||
|
||||
// Init base reader
|
||||
let (start_lsn, base_tarfile_path) = base;
|
||||
@@ -1,23 +1,21 @@
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::process::Child;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{io, result, thread};
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::bail;
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use postgres::Config;
|
||||
use anyhow::Context;
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId};
|
||||
use utils::{http::error::HttpErrorBody, id::NodeId};
|
||||
|
||||
use crate::local_env::{LocalEnv, SafekeeperConf};
|
||||
use crate::storage::PageServerNode;
|
||||
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
|
||||
use crate::connection::PgConnectionConfig;
|
||||
use crate::pageserver::PageServerNode;
|
||||
use crate::{
|
||||
background_process,
|
||||
local_env::{LocalEnv, SafekeeperConf},
|
||||
};
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum SafekeeperHttpError {
|
||||
@@ -63,7 +61,7 @@ pub struct SafekeeperNode {
|
||||
|
||||
pub conf: SafekeeperConf,
|
||||
|
||||
pub pg_connection_config: Config,
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: Client,
|
||||
pub http_base_url: String,
|
||||
@@ -87,15 +85,15 @@ impl SafekeeperNode {
|
||||
}
|
||||
|
||||
/// Construct libpq connection string for connecting to this safekeeper.
|
||||
fn safekeeper_connection_config(port: u16) -> Config {
|
||||
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
|
||||
// TODO safekeeper authentication not implemented yet
|
||||
format!("postgresql://no_user@127.0.0.1:{}/no_db", port)
|
||||
format!("postgresql://no_user@127.0.0.1:{port}/no_db")
|
||||
.parse()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
|
||||
env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
|
||||
env.safekeeper_data_dir(&format!("sk{sk_id}"))
|
||||
}
|
||||
|
||||
pub fn datadir_path(&self) -> PathBuf {
|
||||
@@ -106,91 +104,78 @@ impl SafekeeperNode {
|
||||
self.datadir_path().join("safekeeper.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> anyhow::Result<()> {
|
||||
pub fn start(&self) -> anyhow::Result<Child> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
connection_address(&self.pg_connection_config),
|
||||
self.pg_connection_config.raw_address(),
|
||||
self.datadir_path().display()
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
|
||||
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
|
||||
let id = self.id;
|
||||
let datadir = self.datadir_path();
|
||||
|
||||
let mut cmd = Command::new(self.env.safekeeper_bin()?);
|
||||
fill_rust_env_vars(
|
||||
cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
|
||||
.args(&["--id", self.id.to_string().as_ref()])
|
||||
.args(&["--listen-pg", &listen_pg])
|
||||
.args(&["--listen-http", &listen_http])
|
||||
.arg("--daemonize"),
|
||||
);
|
||||
let id_string = id.to_string();
|
||||
let mut args = vec![
|
||||
"-D",
|
||||
datadir.to_str().with_context(|| {
|
||||
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
"--id",
|
||||
&id_string,
|
||||
"--listen-pg",
|
||||
&listen_pg,
|
||||
"--listen-http",
|
||||
&listen_http,
|
||||
];
|
||||
if !self.conf.sync {
|
||||
cmd.arg("--no-sync");
|
||||
args.push("--no-sync");
|
||||
}
|
||||
|
||||
let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
|
||||
if !comma_separated_endpoints.is_empty() {
|
||||
cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
|
||||
args.extend(["--broker-endpoints", &comma_separated_endpoints]);
|
||||
}
|
||||
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
|
||||
cmd.args(&["--broker-etcd-prefix", prefix]);
|
||||
args.extend(["--broker-etcd-prefix", prefix]);
|
||||
}
|
||||
|
||||
let mut backup_threads = String::new();
|
||||
if let Some(threads) = self.conf.backup_threads {
|
||||
cmd.args(&["--backup-threads", threads.to_string().as_ref()]);
|
||||
backup_threads = threads.to_string();
|
||||
args.extend(["--backup-threads", &backup_threads]);
|
||||
} else {
|
||||
drop(backup_threads);
|
||||
}
|
||||
|
||||
if let Some(ref remote_storage) = self.conf.remote_storage {
|
||||
cmd.args(&["--remote-storage", remote_storage]);
|
||||
args.extend(["--remote-storage", remote_storage]);
|
||||
}
|
||||
|
||||
let key_path = self.env.base_data_dir.join("auth_public_key.pem");
|
||||
if self.conf.auth_enabled {
|
||||
cmd.arg("--auth-validation-public-key-path");
|
||||
// PathBuf is better be passed as is, not via `String`.
|
||||
cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
|
||||
args.extend([
|
||||
"--auth-validation-public-key-path",
|
||||
key_path.to_str().with_context(|| {
|
||||
format!("Key path {key_path:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
]);
|
||||
}
|
||||
|
||||
fill_aws_secrets_vars(&mut cmd);
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Safekeeper failed to start. See '{}' for details.",
|
||||
self.datadir_path().join("safekeeper.log").display()
|
||||
);
|
||||
}
|
||||
|
||||
// It takes a while for the safekeeper to start up. Wait until it is
|
||||
// open for business.
|
||||
const RETRIES: i8 = 15;
|
||||
for retries in 1..RETRIES {
|
||||
match self.check_status() {
|
||||
Ok(_) => {
|
||||
println!("\nSafekeeper started");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => {
|
||||
match err {
|
||||
SafekeeperHttpError::Transport(err) => {
|
||||
if err.is_connect() && retries < 5 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
} else {
|
||||
if retries == 5 {
|
||||
println!() // put a line break after dots for second message
|
||||
}
|
||||
println!(
|
||||
"Safekeeper not responding yet, err {} retrying ({})...",
|
||||
err, retries
|
||||
);
|
||||
}
|
||||
}
|
||||
SafekeeperHttpError::Response(msg) => {
|
||||
bail!("safekeeper failed to start: {} ", msg)
|
||||
}
|
||||
}
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("safekeeper failed to start in {} seconds", RETRIES);
|
||||
background_process::start_process(
|
||||
&format!("safekeeper {id}"),
|
||||
&datadir,
|
||||
&self.env.safekeeper_bin(),
|
||||
&args,
|
||||
background_process::InitialPidFile::Expect(&self.pid_file()),
|
||||
|| match self.check_status() {
|
||||
Ok(()) => Ok(true),
|
||||
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -202,63 +187,11 @@ impl SafekeeperNode {
|
||||
/// If the server is not running, returns success
|
||||
///
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
let pid_file = self.pid_file();
|
||||
if !pid_file.exists() {
|
||||
println!("Safekeeper {} is already stopped", self.id);
|
||||
return Ok(());
|
||||
}
|
||||
let pid = read_pidfile(&pid_file)?;
|
||||
let pid = Pid::from_raw(pid);
|
||||
|
||||
let sig = if immediate {
|
||||
print!("Stopping safekeeper {} immediately..", self.id);
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
print!("Stopping safekeeper {} gracefully..", self.id);
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!(
|
||||
"Safekeeper with pid {} does not exist, but a PID file was found",
|
||||
pid
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to safekeeper with pid {}: {}",
|
||||
pid,
|
||||
err.desc()
|
||||
),
|
||||
}
|
||||
|
||||
// Wait until process is gone
|
||||
for i in 0..600 {
|
||||
let signal = None; // Send no signal, just get the error code
|
||||
match kill(pid, signal) {
|
||||
Ok(_) => (), // Process exists, keep waiting
|
||||
Err(Errno::ESRCH) => {
|
||||
// Process not found, we're done
|
||||
println!("done!");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to pageserver with pid {}: {}",
|
||||
pid,
|
||||
err.desc()
|
||||
),
|
||||
};
|
||||
|
||||
if i % 10 == 0 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
|
||||
bail!("Failed to stop safekeeper with pid {}", pid);
|
||||
background_process::stop_process(
|
||||
immediate,
|
||||
&format!("safekeeper {}", self.id),
|
||||
&self.pid_file(),
|
||||
)
|
||||
}
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
|
||||
13 docker-compose/compute_wrapper/Dockerfile Normal file
@@ -0,0 +1,13 @@
|
||||
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
ARG COMPUTE_IMAGE=compute-node-v14
|
||||
ARG TAG=latest
|
||||
|
||||
FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG
|
||||
|
||||
USER root
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl \
|
||||
jq \
|
||||
netcat
|
||||
|
||||
USER postgres
|
||||
@@ -2,6 +2,7 @@ version: '3'
|
||||
|
||||
services:
|
||||
etcd:
|
||||
restart: always
|
||||
image: quay.io/coreos/etcd:v3.5.4
|
||||
ports:
|
||||
- 2379:2379
|
||||
@@ -9,7 +10,7 @@ services:
|
||||
environment:
|
||||
# This significantly speeds up etcd, and we don't need data persistency there anyway.
ETCD_UNSAFE_NO_FSYNC: "1"
|
||||
command:
|
||||
command:
|
||||
- "etcd"
|
||||
- "--auto-compaction-mode=revision"
|
||||
- "--auto-compaction-retention=1"
|
||||
@@ -24,6 +25,7 @@ services:
|
||||
- "--quota-backend-bytes=134217728" # 128 MB
|
||||
|
||||
minio:
|
||||
restart: always
|
||||
image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
|
||||
ports:
|
||||
- 9000:9000
|
||||
@@ -41,7 +43,7 @@ services:
|
||||
entrypoint:
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
command:
|
||||
command:
|
||||
- "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do
|
||||
echo 'Waiting to start minio...' && sleep 1;
|
||||
done;
|
||||
@@ -51,7 +53,8 @@ services:
|
||||
- minio
|
||||
|
||||
pageserver:
|
||||
image: neondatabase/neon:${TAG:-latest}
|
||||
restart: always
|
||||
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
|
||||
environment:
|
||||
- BROKER_ENDPOINT='http://etcd:2379'
|
||||
- AWS_ACCESS_KEY_ID=minio
|
||||
@@ -77,7 +80,8 @@ services:
|
||||
- minio_create_buckets
|
||||
|
||||
safekeeper1:
|
||||
image: neondatabase/neon:${TAG:-latest}
|
||||
restart: always
|
||||
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
|
||||
environment:
|
||||
- SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
|
||||
- SAFEKEEPER_ID=1
|
||||
@@ -106,7 +110,8 @@ services:
|
||||
- minio_create_buckets
|
||||
|
||||
safekeeper2:
|
||||
image: neondatabase/neon:${TAG:-latest}
|
||||
restart: always
|
||||
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
|
||||
environment:
|
||||
- SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
|
||||
- SAFEKEEPER_ID=2
|
||||
@@ -135,7 +140,8 @@ services:
|
||||
- minio_create_buckets
|
||||
|
||||
safekeeper3:
|
||||
image: neondatabase/neon:${TAG:-latest}
|
||||
restart: always
|
||||
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
|
||||
environment:
|
||||
- SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
|
||||
- SAFEKEEPER_ID=3
|
||||
@@ -164,18 +170,21 @@ services:
|
||||
- minio_create_buckets
|
||||
|
||||
compute:
|
||||
restart: always
|
||||
build:
|
||||
context: ./image/compute
|
||||
context: ./compute_wrapper/
|
||||
args:
|
||||
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest}
|
||||
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
|
||||
- TAG=${TAG:-latest}
|
||||
- http_proxy=$http_proxy
|
||||
- https_proxy=$https_proxy
|
||||
environment:
|
||||
- PG_VERSION=${PG_VERSION:-14}
|
||||
#- RUST_BACKTRACE=1
|
||||
# Mount the test files directly, for faster editing cycle.
|
||||
volumes:
|
||||
- ./compute/var/db/postgres/specs/:/var/db/postgres/specs/
|
||||
- ./compute/shell/:/shell/
|
||||
- ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
|
||||
- ./compute_wrapper/shell/:/shell/
|
||||
ports:
|
||||
- 55433:55433 # pg protocol handler
|
||||
- 3080:3080 # http endpoints
|
||||
|
||||
60 docker-compose/docker_compose_test.sh Executable file
@@ -0,0 +1,60 @@
|
||||
#!/bin/bash
|
||||
|
||||
# A basic test to ensure Docker images are built correctly.
# Builds a wrapper around the compute, starts all services, and runs a simple SQL query.
# Repeats the process for all currently supported Postgres versions.

# Implicitly accepts the `REPOSITORY` and `TAG` env vars that are passed into the compose file,
# to verify custom image builds (e.g. pre-published ones).
# Their defaults point at the DockerHub `neondatabase/neon:latest` image.

# XXX: Currently does not work on M1 Macs, because only x86_64 Docker images are built and the M1 Docker emulation layer has no seccomp support.
|
||||
|
||||
set -eux -o pipefail
|
||||
|
||||
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
|
||||
|
||||
COMPUTE_CONTAINER_NAME=docker-compose-compute-1
|
||||
SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
|
||||
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
|
||||
|
||||
cleanup() {
|
||||
echo "show container information"
|
||||
docker ps
|
||||
docker compose -f $COMPOSE_FILE logs
|
||||
echo "stop containers..."
|
||||
docker compose -f $COMPOSE_FILE down
|
||||
}
|
||||
|
||||
echo "clean up containers if exists"
|
||||
cleanup
|
||||
|
||||
for pg_version in 14 15; do
|
||||
echo "start containers (pg_version=$pg_version)."
|
||||
PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d
|
||||
|
||||
echo "wait until the compute is ready. timeout after 60s. "
|
||||
cnt=0
|
||||
while sleep 1; do
|
||||
# check timeout
|
||||
cnt=`expr $cnt + 1`
|
||||
if [ $cnt -gt 60 ]; then
|
||||
echo "timeout before the compute is ready."
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# check if the compute is ready
|
||||
set +o pipefail
|
||||
result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
|
||||
set -o pipefail
|
||||
if [ $result -eq 1 ]; then
|
||||
echo "OK. The compute is ready to connect."
|
||||
echo "execute simple queries."
|
||||
docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
|
||||
cleanup
|
||||
break
|
||||
fi
|
||||
done
|
||||
done
|
||||
@@ -1,10 +0,0 @@
|
||||
ARG COMPUTE_IMAGE=compute-node-v14:latest
|
||||
FROM neondatabase/${COMPUTE_IMAGE}
|
||||
|
||||
USER root
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl \
|
||||
jq \
|
||||
netcat
|
||||
|
||||
USER postgres
|
||||
@@ -37,7 +37,7 @@
|
||||
|
||||
- [Source view](./sourcetree.md)
|
||||
- [docker.md](./docker.md) — Docker images and building pipeline.
|
||||
- [Error handling and logging]()
|
||||
- [Error handling and logging](./error-handling.md)
|
||||
- [Testing]()
|
||||
- [Unit testing]()
|
||||
- [Integration testing]()
|
||||
|
||||
198 docs/error-handling.md Normal file
@@ -0,0 +1,198 @@
|
||||
# Error handling and logging
|
||||
|
||||
## Logging errors
|
||||
|
||||
The principle is that errors are logged when they are handled. If you
|
||||
just propagate an error to the caller in a function, you don't need to
|
||||
log it; the caller will. But if you consume an error in a function,
|
||||
you *must* log it (if it needs to be logged at all).
|
||||
|
||||
For example:
|
||||
|
||||
```rust
use std::fs::File;
use std::io::Read;

fn read_motd_file() -> std::io::Result<String> {
    let mut f = File::open("/etc/motd")?;
    let mut result = String::new();
    f.read_to_string(&mut result)?;
    Ok(result)
}
```
|
||||
|
||||
Opening or reading the file could fail, but there is no need to log
|
||||
the error here. The function merely propagates the error to the
|
||||
caller, and it is up to the caller to log the error or propagate it
|
||||
further, if the failure is not expected. But if, for example, it is
|
||||
normal that the "/etc/motd" file doesn't exist, the caller can choose
|
||||
to silently ignore the error, or log it as an INFO or DEBUG level
|
||||
message:
|
||||
|
||||
```rust
|
||||
fn get_message_of_the_day() -> String {
|
||||
// Get the motd from /etc/motd, or return the default proverb
|
||||
match read_motd_file() {
|
||||
Ok(motd) => motd,
|
||||
Err(err) => {
|
||||
// It's normal that /etc/motd doesn't exist, but if we fail to
|
||||
// read it for some other reason, that's unexpected. The message
|
||||
// of the day isn't very important though, so we just WARN and
|
||||
// continue with the default in any case.
|
||||
if err.kind() != std::io::ErrorKind::NotFound {
|
||||
tracing::warn!("could not read \"/etc/motd\": {err:?}");
|
||||
}
|
||||
"An old error is always more popular than a new truth. - German proverb"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Error types
|
||||
|
||||
We use the `anyhow` crate widely. It contains many convenient macros
|
||||
like `bail!` and `ensure!` to construct and return errors, and to
|
||||
propagate many kinds of low-level errors, wrapped in `anyhow::Error`.
|
||||
|
||||
A downside of `anyhow::Error` is that the caller cannot distinguish
|
||||
between different error cases. Most errors are propagated all the way
|
||||
to the mgmt API handler function, or the main loop that handles a
|
||||
connection with the compute node, and they are all handled the same
|
||||
way: the error is logged and returned to the client as an HTTP or
|
||||
libpq error.
|
||||
|
||||
But in some cases, we need to distinguish between errors and handle
|
||||
them differently. For example, attaching a tenant to the pageserver
|
||||
could fail either because the tenant has already been attached, or
|
||||
because we could not load its metadata from cloud storage. The first
|
||||
case is more or less expected. The console sends the Attach request to
|
||||
the pageserver, and the pageserver completes the operation, but the
|
||||
network connection might be lost before the console receives the
|
||||
response. The console will retry the operation in that case, but the
|
||||
tenant has already been attached. It is important that the pageserver
|
||||
responds with the HTTP 403 Already Exists error in that case, rather
|
||||
than a generic HTTP 500 Internal Server Error.
|
||||
|
||||
If you need to distinguish between different kinds of errors, create a
|
||||
new `Error` type. The `thiserror` crate is useful for that. But in
|
||||
most cases `anyhow::Error` is good enough.
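For example, a minimal sketch (not the pageserver's actual error type; names are illustrative) of how a `thiserror` enum lets the caller react differently to the "already attached" case:

```rust
use thiserror::Error;

#[derive(Error, Debug)]
enum AttachError {
    /// Expected when the console retries an Attach that already succeeded.
    #[error("tenant already attached")]
    AlreadyAttached,
    /// Everything else is unexpected.
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

fn log_attach_result(result: &Result<(), AttachError>) {
    match result {
        Ok(()) => tracing::info!("tenant attached"),
        // A retried request is normal operation, so don't log it as an error.
        Err(AttachError::AlreadyAttached) => tracing::info!("tenant was already attached"),
        Err(AttachError::Other(err)) => tracing::error!("attach failed: {err:?}"),
    }
}
```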
|
||||
|
||||
## Panics
|
||||
|
||||
Depending on where a panic happens, it can cause the whole pageserver
|
||||
or safekeeper to restart, or just a single tenant. In either case,
|
||||
that is pretty bad and causes an outage. Avoid panics. Never use
|
||||
`unwrap()` or other calls that might panic, to verify inputs from the
|
||||
network or from disk.
|
||||
|
||||
It is acceptable to use functions that might panic, like `unwrap()`, if
|
||||
it is obvious that it cannot panic. For example, if you have just
|
||||
checked that a variable is not None, it is OK to call `unwrap()` on it,
|
||||
but it is still preferable to use `expect("reason")` instead to explain
|
||||
why the function cannot fail.
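A small hypothetical illustration of that preference:

```rust
// The Option has just been checked, so `expect` documents why this cannot
// panic, instead of a bare `unwrap()`.
fn describe(addr: Option<&str>) -> String {
    if addr.is_none() {
        return "no address".to_string();
    }
    let addr = addr.expect("checked to be Some just above");
    format!("address is {addr}")
}
```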
|
||||
|
||||
`assert!` and `panic!` are reserved for checking clear invariants and
|
||||
very obvious "can't happen" cases. When in doubt, use anyhow `ensure!`
|
||||
or `bail!` instead.
|
||||
|
||||
## Error levels
|
||||
|
||||
`tracing::Level` doesn't provide very clear guidelines on what the
|
||||
different levels mean, or when to use which level. Here is how we use
|
||||
them:
|
||||
|
||||
### Error
|
||||
|
||||
Examples:
|
||||
- could not open file "foobar"
|
||||
- invalid tenant id
|
||||
|
||||
Errors are not expected to happen during normal operation. Incorrect
inputs from a client can cause ERRORs. For example, if a client tries to
call a mgmt API that doesn't exist, or if a compute node passes
|
||||
an LSN that has already been garbage collected away.
|
||||
|
||||
These should *not* happen during normal operations. "Normal
|
||||
operations" is not a very precise concept. But for example, disk
|
||||
errors are not expected to happen when the system is working, so those
|
||||
count as Errors. However, if a TCP connection to a compute node is
|
||||
lost, that is not considered an Error, because it doesn't affect the
|
||||
pageserver's or safekeeper's operation in any way, and happens fairly
|
||||
frequently when compute nodes are shut down, or are killed abruptly
|
||||
because of errors in the compute.
|
||||
|
||||
**Errors are monitored, and always need human investigation to determine
|
||||
the cause.**
|
||||
|
||||
Whether something should be logged at ERROR, WARNING or INFO level can
|
||||
depend on the callers and clients. For example, it might be unexpected
|
||||
and a sign of a serious issue if the console calls the
|
||||
"timeline_detail" mgmt API for a timeline that doesn't exist. ERROR
|
||||
would be appropriate in that case. But if the console routinely calls
|
||||
the API after deleting a timeline, to check if the deletion has
|
||||
completed, then it would be totally normal and an INFO or DEBUG level
|
||||
message would be more appropriate. If a message is logged as an ERROR,
|
||||
but it in fact happens frequently in production and never requires any
|
||||
action, it should probably be demoted to an INFO level message.
|
||||
|
||||
### Warn
|
||||
|
||||
Examples:
|
||||
- could not remove temporary file "foobar.temp"
|
||||
- unrecognized file "foobar" in timeline directory
|
||||
|
||||
Warnings are similar to Errors, in that they should not happen
|
||||
when the system is operating normally. The difference between Error and
|
||||
Warning is that an Error means that the operation failed, whereas Warning
|
||||
means that something unexpected happened, but the operation continued anyway.
|
||||
For example, if deleting a file fails because the file already didn't exist,
|
||||
it should be logged as Warning.
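A minimal illustration of that rule (a hypothetical helper, not actual pageserver code):

```rust
// Hypothetical cleanup step: a failure here is unexpected, but the surrounding
// operation still succeeds, so log a Warning and continue instead of failing.
fn remove_temp_file(path: &std::path::Path) {
    if let Err(err) = std::fs::remove_file(path) {
        tracing::warn!("could not remove temporary file {path:?}: {err:?}");
    }
}
```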
|
||||
|
||||
> **Note:** The python regression tests, under `test_regress`, check the
|
||||
> pageserver log after each test for any ERROR and WARN lines. If there are
|
||||
> any ERRORs or WARNs that have not been explicitly listed in the test as
|
||||
> allowed, the test is marked as failed. This is to catch unexpected errors
|
||||
> e.g. in background operations, that don't cause immediate misbehaviour in
|
||||
> the tested functionality.
|
||||
|
||||
### Info
|
||||
|
||||
Info level is used to log useful information when the system is
|
||||
operating normally. Info level is appropriate e.g. for logging state
|
||||
changes, background operations, and network connections.
|
||||
|
||||
Examples:
|
||||
- "system is shutting down"
|
||||
- "tenant was created"
|
||||
- "retrying S3 upload"
|
||||
|
||||
### Debug & Trace
|
||||
|
||||
Debug and Trace level messages are not printed to the log in our normal
|
||||
production configuration, but could be enabled for a specific server or
|
||||
tenant, to aid debugging. (Although we don't actually have that
|
||||
capability as of this writing).
|
||||
|
||||
## Context
|
||||
|
||||
We use logging "spans" to hold context information about the current
|
||||
operation. Almost every operation happens on a particular tenant and
|
||||
timeline, so we enter a span with the "tenant_id" and "timeline_id"
|
||||
very early when processing an incoming API request, for example. All
|
||||
background operations should also run in a span containing at least
|
||||
those two fields, and any other parameters or information that might
|
||||
be useful when debugging an error that might happen when performing
|
||||
the operation.
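A sketch of what entering such a span can look like with `tracing` (illustrative names, not the actual handler code):

```rust
use tracing::{info_span, Instrument};

// Every log line emitted inside the instrumented future automatically
// carries the tenant_id and timeline_id fields.
async fn handle_timeline_request(tenant_id: &str, timeline_id: &str) {
    let span = info_span!("timeline_request", %tenant_id, %timeline_id);
    async {
        tracing::info!("processing request");
        // ... do the actual work ...
    }
    .instrument(span)
    .await;
}
```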
|
||||
|
||||
TODO: Spans are not captured in the Error when it is created, but when
|
||||
the error is logged. It would be more useful to capture them at Error
|
||||
creation. We should consider using `tracing_error::SpanTrace` to do
|
||||
that.
|
||||
|
||||
## Error message style
|
||||
|
||||
PostgreSQL has a style guide for writing error messages:
|
||||
|
||||
https://www.postgresql.org/docs/current/error-style-guide.html
|
||||
|
||||
Follow that guide when writing error messages in the PostgreSQL
|
||||
extension. We don't follow it strictly in the pageserver and
|
||||
safekeeper, but the advice in the PostgreSQL style guide is generally
|
||||
good, and you can't go wrong by following it.
|
||||
246 docs/rfcs/020-pageserver-s3-coordination.md Normal file
@@ -0,0 +1,246 @@
|
||||
# Coordinating access of multiple pageservers to the same s3 data
|
||||
|
||||
## Motivation
|
||||
|
||||
There are some blind spots around coordinating access of multiple pageservers
|
||||
to the same s3 data. Currently this is applicable only to tenant relocation
|
||||
case, but in the future we'll need to solve similar problems for
|
||||
replica/standby pageservers.
|
||||
|
||||
## Impacted components (e.g. pageserver, safekeeper, console, etc)
|
||||
|
||||
Pageserver
|
||||
|
||||
## The problem
|
||||
|
||||
### Relocation
|
||||
|
||||
During relocation both pageservers can write to s3. This should be ok for all
|
||||
data except the `index_part.json`. For index part it causes problems during
|
||||
compaction/gc because they remove files from index/s3.
|
||||
|
||||
Imagine this case:
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant PS1
|
||||
participant S3
|
||||
participant PS2
|
||||
|
||||
PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
|
||||
PS2->>S3: Attach called, sees L1, L2
|
||||
PS1->>S3: Compaction comes <br/> Removes L1, adds L3
|
||||
note over S3: Index now L2, L3
|
||||
PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
|
||||
note over S3: Index now L1, L2, L4
|
||||
```
|
||||
|
||||
At this point it is not possible to restore from the index: it contains L2, which
is no longer available in S3, and doesn't contain L3, which was added by compaction
on the first pageserver. So if either of the pageservers restarts, initial sync will fail
(or, in an on-demand world, it will fail a bit later, during a page request for a
missing layer).
|
||||
|
||||
### Standby pageserver
|
||||
|
||||
Another related case is a standby pageserver. In this case the second pageserver can
be used as a replica to scale reads, and serve as a failover target in case
the first one fails.
|
||||
|
||||
In this mode the second pageserver needs to have the same picture of S3 files to
be able to load layers on demand. To accomplish that, the second pageserver
cannot run GC/compaction jobs. Instead it needs to receive updates to the index
contents. (There is then no need to run the walreceiver on the second pageserver.)
|
||||
|
||||
## Observations
|
||||
|
||||
- If both pageservers ingest WAL, their layer sets diverge, because layer
  file generation is not deterministic
- If one of the pageservers does not ingest WAL (and just picks up layer
  updates), it lags behind and cannot answer queries at the same
  pace as the primary one
- Can compaction help make layers deterministic? E.g. we do not upload level-zero
  layers, and construction of higher levels should be deterministic.
  This way we can guarantee that layer creation by timeout won't mess things up:
  one pageserver uploads data and the second one can just ingest it.
  But we still need some form of election
|
||||
|
||||
## Solutions
|
||||
|
||||
### Manual orchestration
|
||||
|
||||
One possible solution for the relocation case is to orchestrate background jobs
from outside. The oracle who runs the migration can turn off background jobs on
PS1 before the migration, run the migration, and then enable them on PS2. The problem
comes if the migration fails. In this case, in order to resume background jobs, the
oracle needs to guarantee that PS2 doesn't run background jobs, and if PS2 doesn't
respond then PS1 is stuck, unable to run compaction/gc. This cannot be solved
without a human ensuring that no upload from PS2 can happen. In order to be able
to resolve this automatically, CAS is required on the S3 side, so a pageserver can
avoid overwriting the index part if it is no longer the leading one.
|
||||
|
||||
Note that the flag that disables background jobs needs to be persistent, because
otherwise a pageserver restart will clear it.
|
||||
|
||||
### Avoid index_part.json
|
||||
|
||||
The index part consists of two parts: a list of layers and metadata. The list of layers
can be easily obtained with the `ListObjects` S3 API method. But what to do with
the metadata? Create a metadata instance for each checkpoint and add some counter
to the file name?

That brings us back to a potentially long `s3 ls`.
|
||||
|
||||
### Coordination based approach
|
||||
|
||||
Do it like safekeepers choose a leader for WAL upload: ping each other and decide
based on some heuristic, e.g. the smallest node id. During relocation PS1 sends a
"resign" ping message so others can start an election without waiting for a timeout.

This still leaves the metadata question open, and non-deterministic layers are a
problem as well.
|
||||
|
||||
### Avoid metadata file
|
||||
|
||||
One way to eliminate the metadata file is to store it in layer files under some
special key. This may resonate with the intention to keep all relation sizes in
some special segment, to avoid the initial download during size calculation.
Maybe with that we can even store a pre-calculated value.

As a downside, each checkpoint gets 512 bytes larger.

If we entirely avoid the metadata file, this opens up many approaches.
|
||||
|
||||
* * *
|
||||
|
||||
During discussion it seems that we converged on the approach consisting of:
|
||||
|
||||
- Index files stored per pageserver in the same timeline directory. With that, the
  index file name starts to look like `<pageserver_node_id>_index_part.json`.
  In such a setup there are no concurrent overwrites of the index file by different
  pageservers.
- For replica pageservers the solution would be for the primary to broadcast index
  changes to any followers, with an ability to check index files in S3 and
  restore the full state. To properly merge changes with index files we can use
  a counter that is persisted in the index file, is incremented on every change
  to it, and is passed along with each broadcast change. This way we can determine
  whether we need to apply a change to the index state or not (see the sketch
  after this list).
- Responsibility for running background jobs is assigned externally. The pageserver
  keeps a locally persistent flag for each tenant that indicates whether this
  pageserver is considered the primary one or not. TODO: what happens if we
  crash and cannot start for some extended period of time? The control plane can
  assign ownership to some other pageserver. The pageserver needs some way to check
  if it's still the blessed one, maybe by an explicit request to the control plane on
  start.
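A minimal sketch of how a follower could merge broadcast changes using such a counter (struct and field names are illustrative, not the actual `index_part.json` format):

```rust
use std::collections::HashSet;

/// Hypothetical shape of a per-pageserver index file
/// (`<pageserver_node_id>_index_part.json`).
struct PageserverIndex {
    /// Incremented on every change to the index; each broadcast change carries
    /// the counter value so a follower can tell whether it already applied it.
    change_counter: u64,
    layer_files: HashSet<String>,
}

struct IndexChange {
    change_counter: u64,
    added_layers: Vec<String>,
    removed_layers: Vec<String>,
}

impl PageserverIndex {
    /// Apply a broadcast change only if it is newer than the state we already have.
    fn maybe_apply(&mut self, change: &IndexChange) {
        if change.change_counter <= self.change_counter {
            return; // stale or duplicate broadcast, already reflected here
        }
        for layer in &change.removed_layers {
            self.layer_files.remove(layer);
        }
        self.layer_files.extend(change.added_layers.iter().cloned());
        self.change_counter = change.change_counter;
    }
}
```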
|
||||
|
||||
The requirement for deterministic layer generation was considered overly strict
for two reasons:

- It can limit possible optimizations, e.g. when a pageserver wants to reshuffle
  some data locally and doesn't want to coordinate this
- The deterministic algorithm itself can change, so during deployments for some
  time there will be two different versions running at the same time, which can
  cause non-determinism
|
||||
|
||||
### External elections
|
||||
|
||||
The above case with lost state, in the scheme with externally managed
leadership, is represented like this:

Note that here we keep the list of objects in the index file.
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant PS1
|
||||
participant CP as Control Plane
|
||||
participant S3
|
||||
participant PS2
|
||||
|
||||
note over PS1,PS2: PS1 starts up and is still the leader
PS1->>CP: Am I still the leader for Tenant X?
activate CP
CP->>PS1: Yes
deactivate CP
PS1->>S3: Fetch PS1 index.
note over PS1: Continue operations, start background jobs
note over PS1,PS2: PS1 starts up and is not a leader anymore
|
||||
PS1->>CP: Am I still the leader for Tenant X?
|
||||
CP->>PS1: No
|
||||
PS1->>PS2: Subscribe to index changes
|
||||
PS1->>S3: Fetch PS1 and PS2 indexes
|
||||
note over PS1: Combine index file to include layers <br> from both indexes to be able <br> to see newer files from leader (PS2)
|
||||
note over PS1: Continue operations, do not start background jobs
|
||||
```
|
||||
|
||||
### Internal elections
|
||||
|
||||
To manage leadership internally we can use the broker to exchange pings, so nodes
can decide on the leader role. In case multiple pageservers are active, the leader
is the one with the lowest node id.
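A sketch of that rule, assuming each pageserver tracks the ids of peers whose pings it has seen recently (illustrative only):

```rust
/// The pageserver with the lowest id among the live ones leads; everyone else follows.
fn is_leader(own_id: u64, live_peer_ids: &[u64]) -> bool {
    live_peer_ids.iter().all(|&peer| own_id <= peer)
}

fn main() {
    // Node 5 sees pings from nodes 3 and 7: node 3 leads, so we follow.
    assert!(!is_leader(5, &[3, 7]));
    // If node 3 stops pinging, node 5 becomes the leader.
    assert!(is_leader(5, &[7]));
}
```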
|
||||
|
||||
Operations with internally managed elections:
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant PS1
|
||||
participant S3
|
||||
|
||||
note over PS1: Starts up
|
||||
note over PS1: Subscribes to changes, waits for two ping <br> timeouts to see if there is a leader
|
||||
PS1->>S3: Fetch indexes from s3
|
||||
alt there is a leader
|
||||
note over PS1: do not start background jobs, <br> continue applying index updates
|
||||
else there is no leader
|
||||
note over PS1: start background jobs, <br> broadcast index changes
|
||||
end
|
||||
|
||||
note over PS1,S3: Then the picture is similar to external elections <br> the difference is that follower can become a leader <br> if there are no pings after some timeout new leader gets elected
|
||||
```
|
||||
|
||||
### Eviction
|
||||
|
||||
When two pageservers operate on a tenant for an extended period of time, the follower
doesn't perform write operations in S3. When a layer is evicted, the follower relies
on updates from the primary to get info about the layers it needs to cover the range
of the evicted layer.

Note that they won't match the evicted layer exactly, so layers will overlap and
the lookup code needs to handle that correctly.
|
||||
|
||||
### Relocation flow
|
||||
|
||||
Actions become:
|
||||
|
||||
- Attach tenant to new pageserver
|
||||
- New pageserver becomes follower since previous one is still leading
|
||||
- New pageserver starts replicating from safekeepers but does not upload layers
|
||||
- Detach is called on the old one
|
||||
- New pageserver becomes leader after it realizes that old one disappeared
|
||||
|
||||
### Index File
|
||||
|
||||
Using `s3 ls` on startup simplifies things, but we still need the metadata, so we
need to fetch the index files anyway. If they contain the list of files, we can combine
them and avoid a costly `s3 ls`.
|
||||
|
||||
### Remaining issues
|
||||
|
||||
- More than one remote consistent lsn for safekeepers to know
|
||||
|
||||
Anything else?
|
||||
|
||||
### Proposed solution
|
||||
|
||||
To recap: in the meeting we converged on the approach with external elections, but I
think it will be overall harder to manage and will introduce a dependency on
the control plane for the pageserver. Using separate index files for each pageserver,
consisting of a log of operations and a metadata snapshot, should be enough.
|
||||
|
||||
### What we need to get there?
|
||||
|
||||
- Change the index file structure to contain a log of changes instead of just the
  file list
- Implement pinging/elections for pageservers
|
||||
@@ -52,6 +52,10 @@ PostgreSQL extension that implements storage manager API and network communicati
|
||||
|
||||
PostgreSQL extension that contains functions needed for testing and debugging.
|
||||
|
||||
`/pgxn/neon_walredo`:
|
||||
|
||||
Library to run Postgres as a "WAL redo process" in the pageserver.
|
||||
|
||||
`/safekeeper`:
|
||||
|
||||
The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
|
||||
@@ -79,6 +83,16 @@ A subject for future modularization.
|
||||
`/libs/metrics`:
|
||||
Helpers for exposing Prometheus metrics from the server.
|
||||
|
||||
### Adding dependencies
|
||||
When you add a Cargo dependency, you should update the hakari manifest by running the commands below and committing the updated `Cargo.lock` and `workspace_hack/`. There may be no changes; that's fine.
|
||||
|
||||
```bash
|
||||
cargo hakari generate
|
||||
cargo hakari manage-deps
|
||||
```
|
||||
|
||||
If you don't have hakari installed (`error: no such subcommand: hakari`), install it by running `cargo install cargo-hakari`.
|
||||
|
||||
## Using Python
|
||||
Note that Debian/Ubuntu Python packages are stale, as commonly happens,
|
||||
so manual installation of dependencies is not recommended.
|
||||
|
||||
@@ -9,6 +9,7 @@ serde_with = "2.0"
|
||||
const_format = "0.2.21"
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
|
||||
utils = { path = "../utils" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::num::NonZeroU64;
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::{
|
||||
@@ -9,7 +10,7 @@ use utils::{
|
||||
|
||||
use crate::reltag::RelTag;
|
||||
use anyhow::bail;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
|
||||
/// A state of a tenant in pageserver's memory.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -72,6 +73,7 @@ pub struct TenantCreateRequest {
|
||||
pub walreceiver_connect_timeout: Option<String>,
|
||||
pub lagging_wal_timeout: Option<String>,
|
||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||
pub trace_read_requests: Option<bool>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -111,6 +113,7 @@ pub struct TenantConfigRequest {
|
||||
pub walreceiver_connect_timeout: Option<String>,
|
||||
pub lagging_wal_timeout: Option<String>,
|
||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||
pub trace_read_requests: Option<bool>,
|
||||
}
|
||||
|
||||
impl TenantConfigRequest {
|
||||
@@ -129,6 +132,7 @@ impl TenantConfigRequest {
|
||||
walreceiver_connect_timeout: None,
|
||||
lagging_wal_timeout: None,
|
||||
max_lsn_wal_lag: None,
|
||||
trace_read_requests: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -225,6 +229,7 @@ pub struct TimelineGcRequest {
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
#[derive(PartialEq, Eq)]
|
||||
pub enum PagestreamFeMessage {
|
||||
Exists(PagestreamExistsRequest),
|
||||
Nblocks(PagestreamNblocksRequest),
|
||||
@@ -241,21 +246,21 @@ pub enum PagestreamBeMessage {
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct PagestreamExistsRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct PagestreamNblocksRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct PagestreamGetPageRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
@@ -263,7 +268,7 @@ pub struct PagestreamGetPageRequest {
|
||||
pub blkno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct PagestreamDbSizeRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
@@ -296,52 +301,98 @@ pub struct PagestreamDbSizeResponse {
|
||||
}
|
||||
|
||||
impl PagestreamFeMessage {
|
||||
pub fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
|
||||
pub fn serialize(&self) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
match self {
|
||||
Self::Exists(req) => {
|
||||
bytes.put_u8(0);
|
||||
bytes.put_u8(if req.latest { 1 } else { 0 });
|
||||
bytes.put_u64(req.lsn.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
bytes.put_u8(req.rel.forknum);
|
||||
}
|
||||
|
||||
Self::Nblocks(req) => {
|
||||
bytes.put_u8(1);
|
||||
bytes.put_u8(if req.latest { 1 } else { 0 });
|
||||
bytes.put_u64(req.lsn.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
bytes.put_u8(req.rel.forknum);
|
||||
}
|
||||
|
||||
Self::GetPage(req) => {
|
||||
bytes.put_u8(2);
|
||||
bytes.put_u8(if req.latest { 1 } else { 0 });
|
||||
bytes.put_u64(req.lsn.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
bytes.put_u8(req.rel.forknum);
|
||||
bytes.put_u32(req.blkno);
|
||||
}
|
||||
|
||||
Self::DbSize(req) => {
|
||||
bytes.put_u8(3);
|
||||
bytes.put_u8(if req.latest { 1 } else { 0 });
|
||||
bytes.put_u64(req.lsn.0);
|
||||
bytes.put_u32(req.dbnode);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
|
||||
// TODO these gets can fail
|
||||
|
||||
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
||||
//
|
||||
// TODO: consider using protobuf or serde bincode for less error prone
|
||||
// serialization.
|
||||
let msg_tag = body.get_u8();
|
||||
let msg_tag = body.read_u8()?;
|
||||
match msg_tag {
|
||||
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
latest: body.read_u8()? != 0,
|
||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
rel: RelTag {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
})),
|
||||
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
latest: body.read_u8()? != 0,
|
||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
rel: RelTag {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
})),
|
||||
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
latest: body.read_u8()? != 0,
|
||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
rel: RelTag {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
blkno: body.get_u32(),
|
||||
blkno: body.read_u32::<BigEndian>()?,
|
||||
})),
|
||||
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
dbnode: body.get_u32(),
|
||||
latest: body.read_u8()? != 0,
|
||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
})),
|
||||
_ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
|
||||
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -380,3 +431,58 @@ impl PagestreamBeMessage {
|
||||
bytes.into()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use bytes::Buf;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_pagestream() {
|
||||
// Test serialization/deserialization of PagestreamFeMessage
|
||||
let messages = vec![
|
||||
PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
latest: true,
|
||||
lsn: Lsn(4),
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
dbnode: 3,
|
||||
relnode: 4,
|
||||
},
|
||||
}),
|
||||
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
latest: false,
|
||||
lsn: Lsn(4),
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
dbnode: 3,
|
||||
relnode: 4,
|
||||
},
|
||||
}),
|
||||
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
latest: true,
|
||||
lsn: Lsn(4),
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
dbnode: 3,
|
||||
relnode: 4,
|
||||
},
|
||||
blkno: 7,
|
||||
}),
|
||||
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
latest: true,
|
||||
lsn: Lsn(4),
|
||||
dbnode: 7,
|
||||
}),
|
||||
];
|
||||
for msg in messages {
|
||||
let bytes = msg.serialize();
|
||||
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
|
||||
assert!(msg == reconstructed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
12 libs/persistent_range_query/Cargo.toml Normal file
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "persistent_range_query"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.8.3"
|
||||
78 libs/persistent_range_query/src/lib.rs Normal file
@@ -0,0 +1,78 @@
|
||||
use std::ops::Range;
|
||||
|
||||
pub mod naive;
|
||||
pub mod ops;
|
||||
pub mod segment_tree;
|
||||
|
||||
/// Should be a monoid:
|
||||
/// * Identity element: for all a: combine(new_for_empty_range(), a) = combine(a, new_for_empty_range()) = a
|
||||
/// * Associativity: for all a, b, c: combine(combine(a, b), c) == combine(a, combine(b, c))
|
||||
pub trait RangeQueryResult<Key>: Sized + Clone {
|
||||
// Clone is equivalent to combine with an empty range.
|
||||
|
||||
fn new_for_empty_range() -> Self;
|
||||
|
||||
// Contract: left_range.end == right_range.start
|
||||
// left_range.start == left_range.end == right_range.start == right_range.end is still possible
|
||||
fn combine(
|
||||
left: &Self,
|
||||
left_range: &Range<Key>,
|
||||
right: &Self,
|
||||
right_range: &Range<Key>,
|
||||
) -> Self;
|
||||
|
||||
fn add(left: &mut Self, left_range: &Range<Key>, right: &Self, right_range: &Range<Key>);
|
||||
}
|
||||
|
||||
pub trait LazyRangeInitializer<Result: RangeQueryResult<Key>, Key> {
|
||||
fn get(&self, range: &Range<Key>) -> Result;
|
||||
}
|
||||
|
||||
/// Should be a monoid:
|
||||
/// * Identity element: for all op: compose(no_op(), op) == compose(op, no_op()) == op
|
||||
/// * Associativity: for all op_1, op_2, op_3: compose(compose(op_1, op_2), op_3) == compose(op_1, compose(op_2, op_3))
|
||||
///
|
||||
/// Should left act on Result:
|
||||
/// * Identity operation: for all r: no_op().apply(r) == r
|
||||
/// * Compatibility: for all op_1, op_2, r: op_1.apply(op_2.apply(r)) == compose(op_1, op_2).apply(r)
|
||||
pub trait RangeModification<Key> {
|
||||
type Result: RangeQueryResult<Key>;
|
||||
|
||||
fn no_op() -> Self;
|
||||
fn is_no_op(&self) -> bool;
|
||||
fn is_reinitialization(&self) -> bool;
|
||||
fn apply(&self, result: &mut Self::Result, range: &Range<Key>);
|
||||
fn compose(later: &Self, earlier: &mut Self);
|
||||
}
|
||||
|
||||
pub trait VecReadableVersion<Modification: RangeModification<Key>, Key> {
|
||||
fn get(&self, keys: &Range<Key>) -> Modification::Result;
|
||||
}
|
||||
|
||||
// TODO: use trait alias when stabilized
|
||||
pub trait VecFrozenVersion<Modification: RangeModification<Key>, Key>:
|
||||
Clone + VecReadableVersion<Modification, Key>
|
||||
{
|
||||
}
|
||||
|
||||
impl<
|
||||
T: Clone + VecReadableVersion<Modification, Key>,
|
||||
Modification: RangeModification<Key>,
|
||||
Key,
|
||||
> VecFrozenVersion<Modification, Key> for T
|
||||
{
|
||||
}
|
||||
|
||||
pub trait PersistentVecStorage<
|
||||
Modification: RangeModification<Key>,
|
||||
Initializer: LazyRangeInitializer<Modification::Result, Key>,
|
||||
Key,
|
||||
>: VecReadableVersion<Modification, Key>
|
||||
{
|
||||
fn new(all_keys: Range<Key>, initializer: Initializer) -> Self;
|
||||
|
||||
type FrozenVersion: VecFrozenVersion<Modification, Key>;
|
||||
|
||||
fn modify(&mut self, keys: &Range<Key>, modification: &Modification);
|
||||
fn freeze(&mut self) -> Self::FrozenVersion;
|
||||
}
|
||||
115 libs/persistent_range_query/src/naive.rs Normal file
@@ -0,0 +1,115 @@
|
||||
use crate::{
|
||||
LazyRangeInitializer, PersistentVecStorage, RangeModification, RangeQueryResult,
|
||||
VecReadableVersion,
|
||||
};
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::Range;
|
||||
use std::rc::Rc;
|
||||
|
||||
pub struct NaiveFrozenVersion<Modification: RangeModification<Key>, Key> {
|
||||
all_keys: Range<Key>,
|
||||
values: Rc<Box<Vec<Modification::Result>>>,
|
||||
}
|
||||
|
||||
pub trait IndexableKey: Clone {
|
||||
fn index(all_keys: &Range<Self>, key: &Self) -> usize;
|
||||
fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self>;
|
||||
}
|
||||
|
||||
fn get<Modification: RangeModification<Key>, Key: IndexableKey>(
|
||||
all_keys: &Range<Key>,
|
||||
values: &Vec<Modification::Result>,
|
||||
keys: &Range<Key>,
|
||||
) -> Modification::Result {
|
||||
let mut result = Modification::Result::new_for_empty_range();
|
||||
let mut result_range = keys.start.clone()..keys.start.clone();
|
||||
for index in
|
||||
IndexableKey::index(&all_keys, &keys.start)..IndexableKey::index(&all_keys, &keys.end)
|
||||
{
|
||||
let element_range = IndexableKey::element_range(&all_keys, index);
|
||||
Modification::Result::add(&mut result, &result_range, &values[index], &element_range);
|
||||
result_range.end = element_range.end;
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
impl<Modification: RangeModification<Key>, Key: IndexableKey> VecReadableVersion<Modification, Key>
|
||||
for NaiveFrozenVersion<Modification, Key>
|
||||
{
|
||||
fn get(&self, keys: &Range<Key>) -> Modification::Result {
|
||||
get::<Modification, Key>(&self.all_keys, &self.values, keys)
|
||||
}
|
||||
}
|
||||
|
||||
// Manual implementation of `Clone` because `derive` requires `Modification: Clone`
|
||||
impl<Modification: RangeModification<Key>, Key: Clone> Clone
|
||||
for NaiveFrozenVersion<Modification, Key>
|
||||
{
|
||||
fn clone(&self) -> Self {
|
||||
Self {
|
||||
all_keys: self.all_keys.clone(),
|
||||
values: self.values.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: is it at all possible to store previous versions in this struct,
|
||||
// without any Rc<>?
|
||||
pub struct NaiveVecStorage<
|
||||
Modification: RangeModification<Key>,
|
||||
Initializer: LazyRangeInitializer<Modification::Result, Key>,
|
||||
Key: IndexableKey,
|
||||
> {
|
||||
all_keys: Range<Key>,
|
||||
last_version: Vec<Modification::Result>,
|
||||
_initializer: PhantomData<Initializer>,
|
||||
}
|
||||
|
||||
impl<
|
||||
Modification: RangeModification<Key>,
|
||||
Initializer: LazyRangeInitializer<Modification::Result, Key>,
|
||||
Key: IndexableKey,
|
||||
> VecReadableVersion<Modification, Key> for NaiveVecStorage<Modification, Initializer, Key>
|
||||
{
|
||||
fn get(&self, keys: &Range<Key>) -> Modification::Result {
|
||||
get::<Modification, Key>(&self.all_keys, &self.last_version, keys)
|
||||
}
|
||||
}
|
||||
|
||||
impl<
|
||||
Modification: RangeModification<Key>,
|
||||
Initializer: LazyRangeInitializer<Modification::Result, Key>,
|
||||
Key: IndexableKey,
|
||||
> PersistentVecStorage<Modification, Initializer, Key>
|
||||
for NaiveVecStorage<Modification, Initializer, Key>
|
||||
{
|
||||
fn new(all_keys: Range<Key>, initializer: Initializer) -> Self {
|
||||
let mut values = Vec::with_capacity(IndexableKey::index(&all_keys, &all_keys.end));
|
||||
for index in 0..values.capacity() {
|
||||
values.push(initializer.get(&IndexableKey::element_range(&all_keys, index)));
|
||||
}
|
||||
NaiveVecStorage {
|
||||
all_keys,
|
||||
last_version: values,
|
||||
_initializer: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
type FrozenVersion = NaiveFrozenVersion<Modification, Key>;
|
||||
|
||||
fn modify(&mut self, keys: &Range<Key>, modification: &Modification) {
|
||||
for index in IndexableKey::index(&self.all_keys, &keys.start)
|
||||
..IndexableKey::index(&self.all_keys, &keys.end)
|
||||
{
|
||||
let element_range = IndexableKey::element_range(&self.all_keys, index);
|
||||
modification.apply(&mut self.last_version[index], &element_range);
|
||||
}
|
||||
}
|
||||
|
||||
fn freeze(&mut self) -> Self::FrozenVersion {
|
||||
NaiveFrozenVersion::<Modification, Key> {
|
||||
all_keys: self.all_keys.clone(),
|
||||
values: Rc::new(Box::new(self.last_version.clone())),
|
||||
}
|
||||
}
|
||||
}
|
||||
14 libs/persistent_range_query/src/ops/mod.rs Normal file
@@ -0,0 +1,14 @@
|
||||
pub mod rsq;
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct SameElementsInitializer<T> {
|
||||
initial_element_value: T,
|
||||
}
|
||||
|
||||
impl<T> SameElementsInitializer<T> {
|
||||
pub fn new(initial_element_value: T) -> Self {
|
||||
SameElementsInitializer {
|
||||
initial_element_value,
|
||||
}
|
||||
}
|
||||
}
|
||||
118 libs/persistent_range_query/src/ops/rsq.rs Normal file
@@ -0,0 +1,118 @@
|
||||
//! # Range Sum Query
|
||||
|
||||
use crate::ops::SameElementsInitializer;
|
||||
use crate::{LazyRangeInitializer, RangeModification, RangeQueryResult};
|
||||
use std::borrow::Borrow;
|
||||
use std::ops::{Add, AddAssign, Range};
|
||||
|
||||
// TODO: commutative Add
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct SumResult<T> {
|
||||
sum: T,
|
||||
}
|
||||
|
||||
impl<T> SumResult<T> {
|
||||
pub fn sum(&self) -> &T {
|
||||
&self.sum
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Clone + for<'a> AddAssign<&'a T> + From<u8>, Key> RangeQueryResult<Key> for SumResult<T>
|
||||
where
|
||||
for<'a> &'a T: Add<&'a T, Output = T>,
|
||||
{
|
||||
fn new_for_empty_range() -> Self {
|
||||
SumResult { sum: 0.into() }
|
||||
}
|
||||
|
||||
fn combine(
|
||||
left: &Self,
|
||||
_left_range: &Range<Key>,
|
||||
right: &Self,
|
||||
_right_range: &Range<Key>,
|
||||
) -> Self {
|
||||
SumResult {
|
||||
sum: &left.sum + &right.sum,
|
||||
}
|
||||
}
|
||||
|
||||
fn add(left: &mut Self, _left_range: &Range<Key>, right: &Self, _right_range: &Range<Key>) {
|
||||
left.sum += &right.sum
|
||||
}
|
||||
}
|
||||
|
||||
pub trait SumOfSameElements<Key> {
|
||||
fn sum(initial_element_value: &Self, keys: &Range<Key>) -> Self;
|
||||
}
|
||||
|
||||
impl<T: SumOfSameElements<Key>, TB: Borrow<T>, Key> LazyRangeInitializer<SumResult<T>, Key>
|
||||
for SameElementsInitializer<TB>
|
||||
where
|
||||
SumResult<T>: RangeQueryResult<Key>,
|
||||
{
|
||||
fn get(&self, range: &Range<Key>) -> SumResult<T> {
|
||||
SumResult {
|
||||
sum: SumOfSameElements::sum(self.initial_element_value.borrow(), range),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub enum AddAssignModification<T> {
|
||||
None,
|
||||
Add(T),
|
||||
Assign(T),
|
||||
}
|
||||
|
||||
impl<T: Clone + for<'a> AddAssign<&'a T>, Key> RangeModification<Key> for AddAssignModification<T>
|
||||
where
|
||||
SumResult<T>: RangeQueryResult<Key>,
|
||||
for<'a> SameElementsInitializer<&'a T>: LazyRangeInitializer<SumResult<T>, Key>,
|
||||
{
|
||||
type Result = SumResult<T>;
|
||||
|
||||
fn no_op() -> Self {
|
||||
AddAssignModification::None
|
||||
}
|
||||
|
||||
fn is_no_op(&self) -> bool {
|
||||
match self {
|
||||
AddAssignModification::None => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn is_reinitialization(&self) -> bool {
|
||||
match self {
|
||||
AddAssignModification::Assign(_) => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn apply(&self, result: &mut SumResult<T>, range: &Range<Key>) {
|
||||
use AddAssignModification::*;
|
||||
match self {
|
||||
None => {}
|
||||
Add(x) | Assign(x) => {
|
||||
let to_add = SameElementsInitializer::new(x).get(range).sum;
|
||||
if let Assign(_) = self {
|
||||
result.sum = to_add;
|
||||
} else {
|
||||
result.sum += &to_add;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn compose(later: &Self, earlier: &mut Self) {
|
||||
use AddAssignModification::*;
|
||||
match (later, earlier) {
|
||||
(_, e @ None) => *e = later.clone(),
|
||||
(None, _) => {}
|
||||
(Assign(_), e) => *e = later.clone(),
|
||||
(Add(x), Add(y)) => *y += x,
|
||||
(Add(x), Assign(value)) => *value += x,
|
||||
}
|
||||
}
|
||||
}
|
||||
255 libs/persistent_range_query/src/segment_tree.rs Normal file
@@ -0,0 +1,255 @@
|
||||
//! # Segment Tree
|
||||
//! It is a competitive programming folklore data structure. Do not confuse with the interval tree.
|
||||
|
||||
use crate::{LazyRangeInitializer, PersistentVecStorage, RangeQueryResult, VecReadableVersion};
|
||||
use std::ops::Range;
|
||||
use std::rc::Rc;
|
||||
|
||||
pub trait MidpointableKey: Clone + Ord + Sized {
|
||||
fn midpoint(range: &Range<Self>) -> Self;
|
||||
}
|
||||
|
||||
pub trait RangeModification<Key>: Clone + crate::RangeModification<Key> {}
|
||||
|
||||
// TODO: use trait alias when stabilized
|
||||
impl<T: Clone + crate::RangeModification<Key>, Key> RangeModification<Key> for T {}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct Node<Modification: RangeModification<Key>, Key> {
|
||||
result: Modification::Result,
|
||||
modify_children: Modification,
|
||||
left: Option<Rc<Self>>,
|
||||
right: Option<Rc<Self>>,
|
||||
}
|
||||
|
||||
// Manual implementation because we don't need `Key: Clone` for this, unlike with `derive`.
|
||||
impl<Modification: RangeModification<Key>, Key> Clone for Node<Modification, Key> {
|
||||
fn clone(&self) -> Self {
|
||||
Node {
|
||||
result: self.result.clone(),
|
||||
modify_children: self.modify_children.clone(),
|
||||
left: self.left.clone(),
|
||||
right: self.right.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<Modification: RangeModification<Key>, Key> Node<Modification, Key> {
|
||||
fn new<Initializer: LazyRangeInitializer<Modification::Result, Key>>(
|
||||
range: &Range<Key>,
|
||||
initializer: &Initializer,
|
||||
) -> Self {
|
||||
Node {
|
||||
result: initializer.get(range),
|
||||
modify_children: Modification::no_op(),
|
||||
left: None,
|
||||
right: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn apply(&mut self, modification: &Modification, range: &Range<Key>) {
|
||||
modification.apply(&mut self.result, range);
|
||||
Modification::compose(modification, &mut self.modify_children);
|
||||
if self.modify_children.is_reinitialization() {
|
||||
self.left = None;
|
||||
self.right = None;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn force_children<Initializer: LazyRangeInitializer<Modification::Result, Key>>(
|
||||
&mut self,
|
||||
initializer: &Initializer,
|
||||
range_left: &Range<Key>,
|
||||
range_right: &Range<Key>,
|
||||
) {
|
||||
let left = Rc::make_mut(
|
||||
self.left
|
||||
.get_or_insert_with(|| Rc::new(Node::new(&range_left, initializer))),
|
||||
);
|
||||
let right = Rc::make_mut(
|
||||
self.right
|
||||
.get_or_insert_with(|| Rc::new(Node::new(&range_right, initializer))),
|
||||
);
|
||||
left.apply(&self.modify_children, &range_left);
|
||||
right.apply(&self.modify_children, &range_right);
|
||||
self.modify_children = Modification::no_op();
|
||||
}
|
||||
|
||||
pub fn recalculate_from_children(&mut self, range_left: &Range<Key>, range_right: &Range<Key>) {
|
||||
assert!(self.modify_children.is_no_op());
|
||||
assert!(self.left.is_some());
|
||||
assert!(self.right.is_some());
|
||||
self.result = Modification::Result::combine(
|
||||
&self.left.as_ref().unwrap().result,
|
||||
&range_left,
|
||||
&self.right.as_ref().unwrap().result,
|
||||
&range_right,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn split_range<Key: MidpointableKey>(range: &Range<Key>) -> (Range<Key>, Range<Key>) {
|
||||
let range_left = range.start.clone()..MidpointableKey::midpoint(range);
|
||||
let range_right = range_left.end.clone()..range.end.clone();
|
||||
(range_left, range_right)
|
||||
}
|
||||
|
||||
pub struct PersistentSegmentTreeVersion<
|
||||
Modification: RangeModification<Key>,
|
||||
Initializer: LazyRangeInitializer<Modification::Result, Key>,
|
||||
Key: Clone,
|
||||
> {
|
||||
root: Rc<Node<Modification, Key>>,
|
||||
all_keys: Range<Key>,
|
||||
initializer: Rc<Initializer>,
|
||||
}
|
||||
|
||||
// Manual implementation because we don't need `Key: Clone` for this, unlike with `derive`.
|
||||
impl<
|
||||
Modification: RangeModification<Key>,
|
||||
Initializer: LazyRangeInitializer<Modification::Result, Key>,
|
||||
Key: Clone,
|
||||
> Clone for PersistentSegmentTreeVersion<Modification, Initializer, Key>
|
||||
{
|
||||
fn clone(&self) -> Self {
|
||||
Self {
|
||||
root: self.root.clone(),
|
||||
all_keys: self.all_keys.clone(),
|
||||
initializer: self.initializer.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get<
|
||||
Modification: RangeModification<Key>,
|
||||
Initializer: LazyRangeInitializer<Modification::Result, Key>,
|
||||
Key: MidpointableKey,
|
||||
>(
|
||||
node: &mut Rc<Node<Modification, Key>>,
|
||||
node_keys: &Range<Key>,
|
||||
initializer: &Initializer,
|
||||
keys: &Range<Key>,
|
||||
) -> Modification::Result {
|
||||
if node_keys.end <= keys.start || keys.end <= node_keys.start {
|
||||
return Modification::Result::new_for_empty_range();
|
||||
}
|
||||
if keys.start <= node_keys.start && node_keys.end <= keys.end {
|
||||
return node.result.clone();
|
||||
}
|
||||
let node = Rc::make_mut(node);
|
||||
let (left_keys, right_keys) = split_range(node_keys);
|
||||
node.force_children(initializer, &left_keys, &right_keys);
|
||||
let mut result = get(node.left.as_mut().unwrap(), &left_keys, initializer, keys);
|
||||
Modification::Result::add(
|
||||
&mut result,
|
||||
&left_keys,
|
||||
&get(node.right.as_mut().unwrap(), &right_keys, initializer, keys),
|
||||
&right_keys,
|
||||
);
|
||||
result
|
||||
}
|
||||
|
||||
fn modify<
|
||||
Modification: RangeModification<Key>,
|
||||
Initializer: LazyRangeInitializer<Modification::Result, Key>,
|
||||
Key: MidpointableKey,
|
||||
>(
|
||||
node: &mut Rc<Node<Modification, Key>>,
|
||||
node_keys: &Range<Key>,
|
||||
initializer: &Initializer,
|
||||
keys: &Range<Key>,
|
||||
modification: &Modification,
|
||||
) {
|
||||
if modification.is_no_op() || node_keys.end <= keys.start || keys.end <= node_keys.start {
|
||||
return;
|
||||
}
|
||||
let node = Rc::make_mut(node);
|
||||
if keys.start <= node_keys.start && node_keys.end <= keys.end {
|
||||
node.apply(modification, node_keys);
|
||||
return;
|
||||
}
|
||||
let (left_keys, right_keys) = split_range(node_keys);
|
||||
node.force_children(initializer, &left_keys, &right_keys);
|
||||
modify(
|
||||
node.left.as_mut().unwrap(),
|
||||
&left_keys,
|
||||
initializer,
|
||||
keys,
|
||||
&modification,
|
||||
);
|
||||
modify(
|
||||
node.right.as_mut().unwrap(),
|
||||
&right_keys,
|
||||
initializer,
|
||||
keys,
|
||||
&modification,
|
||||
);
|
||||
node.recalculate_from_children(&left_keys, &right_keys);
|
||||
}
|
||||
|
||||
impl<
|
||||
Modification: RangeModification<Key>,
|
||||
Initializer: LazyRangeInitializer<Modification::Result, Key>,
|
||||
Key: MidpointableKey,
|
||||
> VecReadableVersion<Modification, Key>
|
||||
for PersistentSegmentTreeVersion<Modification, Initializer, Key>
|
||||
{
|
||||
fn get(&self, keys: &Range<Key>) -> Modification::Result {
|
||||
get(
|
||||
&mut self.root.clone(), // TODO: do not always force a branch
|
||||
&self.all_keys,
|
||||
self.initializer.as_ref(),
|
||||
keys,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PersistentSegmentTree<
|
||||
Modification: RangeModification<Key>,
|
||||
Initializer: LazyRangeInitializer<Modification::Result, Key>,
|
||||
Key: MidpointableKey,
|
||||
>(PersistentSegmentTreeVersion<Modification, Initializer, Key>);
|
||||
|
||||
impl<
|
||||
Modification: RangeModification<Key>,
|
||||
Initializer: LazyRangeInitializer<Modification::Result, Key>,
|
||||
Key: MidpointableKey,
|
||||
> VecReadableVersion<Modification, Key>
|
||||
for PersistentSegmentTree<Modification, Initializer, Key>
|
||||
{
|
||||
fn get(&self, keys: &Range<Key>) -> Modification::Result {
|
||||
self.0.get(keys)
|
||||
}
|
||||
}
|
||||
|
||||
impl<
|
||||
Modification: RangeModification<Key>,
|
||||
Initializer: LazyRangeInitializer<Modification::Result, Key>,
|
||||
Key: MidpointableKey,
|
||||
> PersistentVecStorage<Modification, Initializer, Key>
|
||||
for PersistentSegmentTree<Modification, Initializer, Key>
|
||||
{
|
||||
fn new(all_keys: Range<Key>, initializer: Initializer) -> Self {
|
||||
PersistentSegmentTree(PersistentSegmentTreeVersion {
|
||||
root: Rc::new(Node::new(&all_keys, &initializer)),
|
||||
all_keys: all_keys,
|
||||
initializer: Rc::new(initializer),
|
||||
})
|
||||
}
|
||||
|
||||
type FrozenVersion = PersistentSegmentTreeVersion<Modification, Initializer, Key>;
|
||||
|
||||
fn modify(&mut self, keys: &Range<Key>, modification: &Modification) {
|
||||
modify(
|
||||
&mut self.0.root, // TODO: do not always force a branch
|
||||
&self.0.all_keys,
|
||||
self.0.initializer.as_ref(),
|
||||
keys,
|
||||
modification,
|
||||
)
|
||||
}
|
||||
|
||||
fn freeze(&mut self) -> Self::FrozenVersion {
|
||||
self.0.clone()
|
||||
}
|
||||
}
|
||||
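The persistence above comes from `Rc` sharing: `freeze` just clones the version handle, and `Rc::make_mut` copies a node only while it is still shared with an older version. A tiny self-contained sketch of that copy-on-write mechanism (standard library only):

```rust
use std::rc::Rc;

fn main() {
    // The "tree" here is just a Vec, but the sharing story is the same.
    let mut root = Rc::new(vec![1, 2, 3]);
    let frozen = Rc::clone(&root); // cheap frozen version, like freeze()

    // Copy-on-write: because `root` is shared, make_mut clones the data first,
    // so the frozen version keeps observing the old contents.
    Rc::make_mut(&mut root)[0] = 42;

    assert_eq!(frozen[0], 1);
    assert_eq!(root[0], 42);
}
```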
libs/persistent_range_query/tests/layer_map_test.rs (new file, 295 lines)
@@ -0,0 +1,295 @@
|
||||
use persistent_range_query::naive::{IndexableKey, NaiveVecStorage};
|
||||
use persistent_range_query::ops::SameElementsInitializer;
|
||||
use persistent_range_query::segment_tree::{MidpointableKey, PersistentSegmentTree};
|
||||
use persistent_range_query::{
|
||||
LazyRangeInitializer, PersistentVecStorage, RangeModification, RangeQueryResult,
|
||||
VecReadableVersion,
|
||||
};
|
||||
use std::cmp::Ordering;
|
||||
use std::ops::Range;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)]
|
||||
struct PageIndex(u32);
|
||||
type LayerId = String;
|
||||
|
||||
impl IndexableKey for PageIndex {
|
||||
fn index(all_keys: &Range<Self>, key: &Self) -> usize {
|
||||
(key.0 as usize) - (all_keys.start.0 as usize)
|
||||
}
|
||||
|
||||
fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self> {
|
||||
PageIndex(all_keys.start.0 + index as u32)..PageIndex(all_keys.start.0 + index as u32 + 1)
|
||||
}
|
||||
}
|
||||
|
||||
impl MidpointableKey for PageIndex {
|
||||
fn midpoint(range: &Range<Self>) -> Self {
|
||||
PageIndex(range.start.0 + (range.end.0 - range.start.0) / 2)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
struct LayerMapInformation {
|
||||
// Only makes sense for a range of length 1.
last_layer: Option<LayerId>,
last_image_layer: Option<LayerId>,
// Works for all ranges.
max_delta_layers: (usize, Range<PageIndex>),
|
||||
}
|
||||
|
||||
impl LayerMapInformation {
|
||||
fn last_layers(&self) -> (&Option<LayerId>, &Option<LayerId>) {
|
||||
(&self.last_layer, &self.last_image_layer)
|
||||
}
|
||||
|
||||
fn max_delta_layers(&self) -> &(usize, Range<PageIndex>) {
|
||||
&self.max_delta_layers
|
||||
}
|
||||
}
|
||||
|
||||
fn merge_ranges(left: &Range<PageIndex>, right: &Range<PageIndex>) -> Range<PageIndex> {
|
||||
if left.is_empty() {
|
||||
right.clone()
|
||||
} else if right.is_empty() {
|
||||
left.clone()
|
||||
} else if left.end == right.start {
|
||||
left.start..right.end
|
||||
} else {
|
||||
left.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl RangeQueryResult<PageIndex> for LayerMapInformation {
|
||||
fn new_for_empty_range() -> Self {
|
||||
LayerMapInformation {
|
||||
last_layer: None,
|
||||
last_image_layer: None,
|
||||
max_delta_layers: (0, PageIndex(0)..PageIndex(0)),
|
||||
}
|
||||
}
|
||||
|
||||
fn combine(
|
||||
left: &Self,
|
||||
_left_range: &Range<PageIndex>,
|
||||
right: &Self,
|
||||
_right_range: &Range<PageIndex>,
|
||||
) -> Self {
|
||||
// Note that either range may be empty.
|
||||
LayerMapInformation {
|
||||
last_layer: left
|
||||
.last_layer
|
||||
.as_ref()
|
||||
.or_else(|| right.last_layer.as_ref())
|
||||
.cloned(),
|
||||
last_image_layer: left
|
||||
.last_image_layer
|
||||
.as_ref()
|
||||
.or_else(|| right.last_image_layer.as_ref())
|
||||
.cloned(),
|
||||
max_delta_layers: match left.max_delta_layers.0.cmp(&right.max_delta_layers.0) {
|
||||
Ordering::Less => right.max_delta_layers.clone(),
|
||||
Ordering::Greater => left.max_delta_layers.clone(),
|
||||
Ordering::Equal => (
|
||||
left.max_delta_layers.0,
|
||||
merge_ranges(&left.max_delta_layers.1, &right.max_delta_layers.1),
|
||||
),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn add(
|
||||
left: &mut Self,
|
||||
left_range: &Range<PageIndex>,
|
||||
right: &Self,
|
||||
right_range: &Range<PageIndex>,
|
||||
) {
|
||||
*left = Self::combine(&left, left_range, right, right_range);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct AddDeltaLayers {
|
||||
last_layer: LayerId,
|
||||
count: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct LayerMapModification {
|
||||
add_image_layer: Option<LayerId>,
|
||||
add_delta_layers: Option<AddDeltaLayers>,
|
||||
}
|
||||
|
||||
impl LayerMapModification {
|
||||
fn add_image_layer(layer: impl Into<LayerId>) -> Self {
|
||||
LayerMapModification {
|
||||
add_image_layer: Some(layer.into()),
|
||||
add_delta_layers: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn add_delta_layer(layer: impl Into<LayerId>) -> Self {
|
||||
LayerMapModification {
|
||||
add_image_layer: None,
|
||||
add_delta_layers: Some(AddDeltaLayers {
|
||||
last_layer: layer.into(),
|
||||
count: 1,
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl RangeModification<PageIndex> for LayerMapModification {
|
||||
type Result = LayerMapInformation;
|
||||
|
||||
fn no_op() -> Self {
|
||||
LayerMapModification {
|
||||
add_image_layer: None,
|
||||
add_delta_layers: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn is_no_op(&self) -> bool {
|
||||
self.add_image_layer.is_none() && self.add_delta_layers.is_none()
|
||||
}
|
||||
|
||||
fn is_reinitialization(&self) -> bool {
|
||||
self.add_image_layer.is_some()
|
||||
}
|
||||
|
||||
fn apply(&self, result: &mut Self::Result, range: &Range<PageIndex>) {
|
||||
if let Some(layer) = &self.add_image_layer {
|
||||
result.last_layer = Some(layer.clone());
|
||||
result.last_image_layer = Some(layer.clone());
|
||||
result.max_delta_layers = (0, range.clone());
|
||||
}
|
||||
if let Some(AddDeltaLayers { last_layer, count }) = &self.add_delta_layers {
|
||||
result.last_layer = Some(last_layer.clone());
|
||||
result.max_delta_layers.0 += count;
|
||||
}
|
||||
}
|
||||
|
||||
fn compose(later: &Self, earlier: &mut Self) {
|
||||
if later.add_image_layer.is_some() {
|
||||
*earlier = later.clone();
|
||||
return;
|
||||
}
|
||||
if let Some(AddDeltaLayers { last_layer, count }) = &later.add_delta_layers {
|
||||
let res = earlier.add_delta_layers.get_or_insert(AddDeltaLayers {
|
||||
last_layer: LayerId::default(),
|
||||
count: 0,
|
||||
});
|
||||
res.last_layer = last_layer.clone();
|
||||
res.count += count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LazyRangeInitializer<LayerMapInformation, PageIndex> for SameElementsInitializer<()> {
|
||||
fn get(&self, range: &Range<PageIndex>) -> LayerMapInformation {
|
||||
LayerMapInformation {
|
||||
last_layer: None,
|
||||
last_image_layer: None,
|
||||
max_delta_layers: (0, range.clone()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn test_layer_map<
|
||||
S: PersistentVecStorage<LayerMapModification, SameElementsInitializer<()>, PageIndex>,
|
||||
>() {
|
||||
let mut s = S::new(
|
||||
PageIndex(0)..PageIndex(100),
|
||||
SameElementsInitializer::new(()),
|
||||
);
|
||||
s.modify(
|
||||
&(PageIndex(0)..PageIndex(70)),
|
||||
&LayerMapModification::add_image_layer("Img0..70"),
|
||||
);
|
||||
s.modify(
|
||||
&(PageIndex(50)..PageIndex(100)),
|
||||
&LayerMapModification::add_image_layer("Img50..100"),
|
||||
);
|
||||
s.modify(
|
||||
&(PageIndex(10)..PageIndex(60)),
|
||||
&LayerMapModification::add_delta_layer("Delta10..60"),
|
||||
);
|
||||
let s_before_last_delta = s.freeze();
|
||||
s.modify(
|
||||
&(PageIndex(20)..PageIndex(80)),
|
||||
&LayerMapModification::add_delta_layer("Delta20..80"),
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
s.get(&(PageIndex(5)..PageIndex(6))).last_layers(),
|
||||
(&Some("Img0..70".to_owned()), &Some("Img0..70".to_owned()))
|
||||
);
|
||||
assert_eq!(
|
||||
s.get(&(PageIndex(15)..PageIndex(16))).last_layers(),
|
||||
(
|
||||
&Some("Delta10..60".to_owned()),
|
||||
&Some("Img0..70".to_owned())
|
||||
)
|
||||
);
|
||||
assert_eq!(
|
||||
s.get(&(PageIndex(25)..PageIndex(26))).last_layers(),
|
||||
(
|
||||
&Some("Delta20..80".to_owned()),
|
||||
&Some("Img0..70".to_owned())
|
||||
)
|
||||
);
|
||||
assert_eq!(
|
||||
s.get(&(PageIndex(65)..PageIndex(66))).last_layers(),
|
||||
(
|
||||
&Some("Delta20..80".to_owned()),
|
||||
&Some("Img50..100".to_owned())
|
||||
)
|
||||
);
|
||||
assert_eq!(
|
||||
s.get(&(PageIndex(95)..PageIndex(96))).last_layers(),
|
||||
(
|
||||
&Some("Img50..100".to_owned()),
|
||||
&Some("Img50..100".to_owned())
|
||||
)
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
s.get(&(PageIndex(0)..PageIndex(100))).max_delta_layers(),
|
||||
&(2, PageIndex(20)..PageIndex(60)),
|
||||
);
|
||||
assert_eq!(
|
||||
*s_before_last_delta
|
||||
.get(&(PageIndex(0)..PageIndex(100)))
|
||||
.max_delta_layers(),
|
||||
(1, PageIndex(10)..PageIndex(60)),
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
*s.get(&(PageIndex(10)..PageIndex(30))).max_delta_layers(),
|
||||
(2, PageIndex(20)..PageIndex(30))
|
||||
);
|
||||
assert_eq!(
|
||||
*s.get(&(PageIndex(10)..PageIndex(20))).max_delta_layers(),
|
||||
(1, PageIndex(10)..PageIndex(20))
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
*s.get(&(PageIndex(70)..PageIndex(80))).max_delta_layers(),
|
||||
(1, PageIndex(70)..PageIndex(80))
|
||||
);
|
||||
assert_eq!(
|
||||
*s_before_last_delta
|
||||
.get(&(PageIndex(70)..PageIndex(80)))
|
||||
.max_delta_layers(),
|
||||
(0, PageIndex(70)..PageIndex(80))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_naive() {
|
||||
test_layer_map::<NaiveVecStorage<_, _, _>>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_segment_tree() {
|
||||
test_layer_map::<PersistentSegmentTree<_, _, _>>();
|
||||
}
|
||||
libs/persistent_range_query/tests/rsq_test.rs (new file, 116 lines)
@@ -0,0 +1,116 @@
|
||||
use persistent_range_query::naive::*;
|
||||
use persistent_range_query::ops::rsq::AddAssignModification::Add;
|
||||
use persistent_range_query::ops::rsq::*;
|
||||
use persistent_range_query::ops::SameElementsInitializer;
|
||||
use persistent_range_query::segment_tree::{MidpointableKey, PersistentSegmentTree};
|
||||
use persistent_range_query::{PersistentVecStorage, VecReadableVersion};
|
||||
use rand::{Rng, SeedableRng};
|
||||
use std::ops::Range;
|
||||
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
|
||||
struct K(u16);
|
||||
|
||||
impl IndexableKey for K {
|
||||
fn index(all_keys: &Range<Self>, key: &Self) -> usize {
|
||||
(key.0 as usize) - (all_keys.start.0 as usize)
|
||||
}
|
||||
|
||||
fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self> {
|
||||
K(all_keys.start.0 + index as u16)..K(all_keys.start.0 + index as u16 + 1)
|
||||
}
|
||||
}
|
||||
|
||||
impl SumOfSameElements<K> for i32 {
|
||||
fn sum(initial_element_value: &Self, keys: &Range<K>) -> Self {
|
||||
initial_element_value * (keys.end.0 - keys.start.0) as Self
|
||||
}
|
||||
}
|
||||
|
||||
impl MidpointableKey for K {
|
||||
fn midpoint(range: &Range<Self>) -> Self {
|
||||
K(range.start.0 + (range.end.0 - range.start.0) / 2)
|
||||
}
|
||||
}
|
||||
|
||||
fn test_storage<
|
||||
S: PersistentVecStorage<AddAssignModification<i32>, SameElementsInitializer<i32>, K>,
|
||||
>() {
|
||||
let mut s = S::new(K(0)..K(12), SameElementsInitializer::new(0i32));
|
||||
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 0);
|
||||
|
||||
s.modify(&(K(2)..K(5)), &AddAssignModification::Add(3));
|
||||
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 3 + 3);
|
||||
let s_old = s.freeze();
|
||||
|
||||
s.modify(&(K(3)..K(6)), &AddAssignModification::Assign(10));
|
||||
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 10 + 10 + 10);
|
||||
|
||||
s.modify(&(K(4)..K(7)), &AddAssignModification::Add(2));
|
||||
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 10 + 12 + 12 + 2);
|
||||
|
||||
assert_eq!(*s.get(&(K(4)..K(6))).sum(), 12 + 12);
|
||||
assert_eq!(*s_old.get(&(K(4)..K(6))).sum(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_naive() {
|
||||
test_storage::<NaiveVecStorage<_, _, _>>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_segment_tree() {
|
||||
test_storage::<PersistentSegmentTree<_, _, _>>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stress() {
|
||||
const LEN: u16 = 17_238;
|
||||
const OPERATIONS: i32 = 20_000;
|
||||
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(0);
|
||||
let mut naive: NaiveVecStorage<AddAssignModification<i32>, _, _> =
|
||||
NaiveVecStorage::new(K(0)..K(LEN), SameElementsInitializer::new(2i32));
|
||||
let mut segm_tree: PersistentSegmentTree<AddAssignModification<i32>, _, _> =
|
||||
PersistentSegmentTree::new(K(0)..K(LEN), SameElementsInitializer::new(2i32));
|
||||
|
||||
fn gen_range(rng: &mut impl Rng) -> Range<K> {
|
||||
let l: u16 = rng.gen_range(0..LEN);
|
||||
let r: u16 = rng.gen_range(0..LEN);
|
||||
if l <= r {
|
||||
K(l)..K(r)
|
||||
} else {
|
||||
K(r)..K(l)
|
||||
}
|
||||
}
|
||||
|
||||
for _ in 0..2 {
|
||||
let checksum_range = gen_range(&mut rng);
|
||||
let checksum_before: i32 = *naive.get(&checksum_range).sum();
|
||||
assert_eq!(checksum_before, *segm_tree.get(&checksum_range).sum());
|
||||
|
||||
let naive_before = naive.freeze();
|
||||
let segm_tree_before = segm_tree.freeze();
|
||||
assert_eq!(checksum_before, *naive_before.get(&checksum_range).sum());
|
||||
assert_eq!(checksum_before, *segm_tree.get(&checksum_range).sum());
|
||||
|
||||
for _ in 0..OPERATIONS {
|
||||
{
|
||||
let range = gen_range(&mut rng);
|
||||
assert_eq!(naive.get(&range).sum(), segm_tree.get(&range).sum());
|
||||
}
|
||||
{
|
||||
let range = gen_range(&mut rng);
|
||||
let val = rng.gen_range(-10i32..=10i32);
|
||||
let op = Add(val);
|
||||
naive.modify(&range, &op);
|
||||
segm_tree.modify(&range, &op);
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(checksum_before, *naive_before.get(&checksum_range).sum());
|
||||
assert_eq!(
|
||||
checksum_before,
|
||||
*segm_tree_before.get(&checksum_range).sum()
|
||||
);
|
||||
}
|
||||
}
|
||||
libs/pq_proto/Cargo.toml (new file, 16 lines)
@@ -0,0 +1,16 @@
[package]
name = "pq_proto"
version = "0.1.0"
edition = "2021"

[dependencies]
anyhow = "1.0"
bytes = "1.0.1"
pin-project-lite = "0.2.7"
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
rand = "0.8.3"
serde = { version = "1.0", features = ["derive"] }
tokio = { version = "1.17", features = ["macros"] }
tracing = "0.1"

workspace_hack = { version = "0.1", path = "../../workspace_hack" }
@@ -2,7 +2,9 @@
|
||||
//! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
|
||||
//! on message formats.
|
||||
|
||||
use crate::sync::{AsyncishRead, SyncFuture};
|
||||
// Tools for calling certain async methods in sync contexts.
|
||||
pub mod sync;
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use postgres_protocol::PG_EPOCH;
|
||||
@@ -16,6 +18,7 @@ use std::{
|
||||
str,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
use sync::{AsyncishRead, SyncFuture};
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tracing::{trace, warn};
|
||||
|
||||
@@ -198,7 +201,7 @@ impl FeMessage {
|
||||
///
|
||||
/// ```
|
||||
/// # use std::io;
|
||||
/// # use utils::pq_proto::FeMessage;
|
||||
/// # use pq_proto::FeMessage;
|
||||
/// #
|
||||
/// # fn process_message(msg: FeMessage) -> anyhow::Result<()> {
|
||||
/// # Ok(())
|
||||
@@ -302,6 +305,7 @@ impl FeStartupPacket {
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
#[allow(clippy::manual_range_contains)]
|
||||
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
|
||||
bail!("invalid message length");
|
||||
}
|
||||
@@ -29,7 +29,7 @@ impl<S, T: Future> SyncFuture<S, T> {
|
||||
/// Example:
|
||||
///
|
||||
/// ```
|
||||
/// # use utils::sync::SyncFuture;
|
||||
/// # use pq_proto::sync::SyncFuture;
|
||||
/// # use std::future::Future;
|
||||
/// # use tokio::io::AsyncReadExt;
|
||||
/// #
|
||||
libs/tenant_size_model/.gitignore (new file, vendored, 3 lines)
@@ -0,0 +1,3 @@
*.dot
*.png
*.svg
libs/tenant_size_model/Cargo.toml (new file, 8 lines)
@@ -0,0 +1,8 @@
[package]
name = "tenant_size_model"
version = "0.1.0"
edition = "2021"
publish = false

[dependencies]
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
libs/tenant_size_model/Makefile (new file, 13 lines)
@@ -0,0 +1,13 @@
all: 1.svg 2.svg 3.svg 4.svg 1.png 2.png 3.png 4.png

../../target/debug/tenant_size_model: Cargo.toml src/main.rs src/lib.rs
	cargo build --bin tenant_size_model

%.svg: %.dot
	dot -Tsvg $< > $@

%.png: %.dot
	dot -Tpng $< > $@

%.dot: ../../target/debug/tenant_size_model
	../../target/debug/tenant_size_model $* > $@
libs/tenant_size_model/README.md (new file, 7 lines)
@@ -0,0 +1,7 @@
# Logical size + WAL pricing

This is a simulator to calculate the tenant size in different scenarios,
using the "Logical size + WAL" method. The Makefile produces diagrams used in a
private presentation:

https://docs.google.com/presentation/d/1OapE4k11xmcwMh7I7YvNWGC63yCRLh6udO9bXZ-fZmo/edit?usp=sharing
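A minimal sketch of the builder API that the scenarios in `src/main.rs` exercise (the crate and method names are taken from this diff; the byte counts and retention period are arbitrary):

```rust
use tenant_size_model::Storage;

fn main() {
    let mut storage = Storage::new("main");
    storage.insert("main", 5_000);   // bulk load
    storage.branch("main", "child"); // branch off main
    storage.update("child", 1_000);  // churn on the branch
    storage.update("main", 1_000);   // churn on main

    // The retention period is expressed in the same units as the sizes above.
    let size = storage.calculate(1_000);
    println!("synthetic tenant size: {}", size.total_children());
}
```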
libs/tenant_size_model/src/lib.rs (new file, 382 lines)
@@ -0,0 +1,382 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Pricing model or history size builder.
|
||||
///
|
||||
/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
|
||||
/// type.
|
||||
pub struct Storage<K: 'static> {
|
||||
segments: Vec<Segment>,
|
||||
|
||||
/// Mapping from the branch name to the index of the segment describing its latest state.
branches: HashMap<K, usize>,
|
||||
}
|
||||
|
||||
/// Snapshot of a branch.
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
pub struct Segment {
|
||||
/// Previous segment index into [`Storage::segments`], if any.
parent: Option<usize>,
|
||||
|
||||
/// Description of how we got to this state.
///
|
||||
/// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when
|
||||
/// modifying a branch directly.
|
||||
pub op: Cow<'static, str>,
|
||||
|
||||
/// LSN before this state
|
||||
start_lsn: u64,
|
||||
|
||||
/// LSN at this state
|
||||
pub end_lsn: u64,
|
||||
|
||||
/// Logical size before this state
|
||||
start_size: u64,
|
||||
|
||||
/// Logical size at this state. Can be None in the last Segment of a branch.
|
||||
pub end_size: Option<u64>,
|
||||
|
||||
/// Indices to [`Storage::segments`]
|
||||
///
|
||||
/// FIXME: this could be an Option<usize>
|
||||
children_after: Vec<usize>,
|
||||
|
||||
/// Determined by `retention_period` given to [`Storage::calculate`]
|
||||
pub needed: bool,
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
// *-g--*---D--->
|
||||
// /
|
||||
// /
|
||||
// / *---b----*-B--->
|
||||
// / /
|
||||
// / /
|
||||
// -----*--e---*-----f----* C
|
||||
// E \
|
||||
// \
|
||||
// *--a---*---A-->
|
||||
//
|
||||
// If A and B need to be retained, is it cheaper to store
|
||||
// snapshot at C+a+b, or snapshots at A and B ?
|
||||
//
|
||||
// If D also needs to be retained, which is cheaper:
|
||||
//
|
||||
// 1. E+g+e+f+a+b
|
||||
// 2. D+C+a+b
|
||||
// 3. D+A+B
|
||||
|
||||
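// For a concrete feel (illustrative numbers, not taken from the code): if a
// snapshot at C costs 5_000 and the WAL deltas a and b cost 1_000 each, then
// retaining C+a+b costs 7_000, while separate snapshots at A and B costing
// 5_500 each would total 11_000, so the shared snapshot wins. `calculate`
// below makes this comparison per child segment by taking the cheaper of
// `size_from_wal` and `size_from_snapshot_later`.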
/// A [`Segment`] which has had its size calculated.
pub struct SegmentSize {
|
||||
pub seg_id: usize,
|
||||
|
||||
pub method: SegmentMethod,
|
||||
|
||||
this_size: u64,
|
||||
|
||||
pub children: Vec<SegmentSize>,
|
||||
}
|
||||
|
||||
impl SegmentSize {
|
||||
fn total(&self) -> u64 {
|
||||
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
}
|
||||
|
||||
pub fn total_children(&self) -> u64 {
|
||||
if self.method == SnapshotAfter {
|
||||
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
} else {
|
||||
self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Different methods to retain history from a particular state
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
pub enum SegmentMethod {
|
||||
SnapshotAfter,
|
||||
Wal,
|
||||
WalNeeded,
|
||||
Skipped,
|
||||
}
|
||||
|
||||
use SegmentMethod::*;
|
||||
|
||||
impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
/// Creates a new storage with the given default branch name.
|
||||
pub fn new(initial_branch: K) -> Storage<K> {
|
||||
let init_segment = Segment {
|
||||
op: "".into(),
|
||||
needed: false,
|
||||
parent: None,
|
||||
start_lsn: 0,
|
||||
end_lsn: 0,
|
||||
start_size: 0,
|
||||
end_size: Some(0),
|
||||
children_after: Vec::new(),
|
||||
};
|
||||
|
||||
Storage {
|
||||
segments: vec![init_segment],
|
||||
branches: HashMap::from([(initial_branch, 0)]),
|
||||
}
|
||||
}
|
||||
|
||||
/// Advances the branch with a new point, at given LSN.
|
||||
pub fn insert_point<Q: ?Sized>(
|
||||
&mut self,
|
||||
branch: &Q,
|
||||
op: Cow<'static, str>,
|
||||
lsn: u64,
|
||||
size: Option<u64>,
|
||||
) where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
{
|
||||
let lastseg_id = *self.branches.get(branch).unwrap();
|
||||
let newseg_id = self.segments.len();
|
||||
let lastseg = &mut self.segments[lastseg_id];
|
||||
|
||||
assert!(lsn > lastseg.end_lsn);
|
||||
|
||||
let newseg = Segment {
|
||||
op,
|
||||
parent: Some(lastseg_id),
|
||||
start_lsn: lastseg.end_lsn,
|
||||
end_lsn: lsn,
|
||||
start_size: lastseg.end_size.unwrap(),
|
||||
end_size: size,
|
||||
children_after: Vec::new(),
|
||||
needed: false,
|
||||
};
|
||||
lastseg.children_after.push(newseg_id);
|
||||
|
||||
self.segments.push(newseg);
|
||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||
}
|
||||
|
||||
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
|
||||
pub fn modify_branch<Q: ?Sized>(
|
||||
&mut self,
|
||||
branch: &Q,
|
||||
op: Cow<'static, str>,
|
||||
lsn_bytes: u64,
|
||||
size_bytes: i64,
|
||||
) where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
{
|
||||
let lastseg_id = *self.branches.get(branch).unwrap();
|
||||
let newseg_id = self.segments.len();
|
||||
let lastseg = &mut self.segments[lastseg_id];
|
||||
|
||||
let newseg = Segment {
|
||||
op,
|
||||
parent: Some(lastseg_id),
|
||||
start_lsn: lastseg.end_lsn,
|
||||
end_lsn: lastseg.end_lsn + lsn_bytes,
|
||||
start_size: lastseg.end_size.unwrap(),
|
||||
end_size: Some((lastseg.end_size.unwrap() as i64 + size_bytes) as u64),
|
||||
children_after: Vec::new(),
|
||||
needed: false,
|
||||
};
|
||||
lastseg.children_after.push(newseg_id);
|
||||
|
||||
self.segments.push(newseg);
|
||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||
}
|
||||
|
||||
pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
{
|
||||
self.modify_branch(branch, "insert".into(), bytes, bytes as i64);
|
||||
}
|
||||
|
||||
pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
{
|
||||
self.modify_branch(branch, "update".into(), bytes, 0i64);
|
||||
}
|
||||
|
||||
pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
{
|
||||
self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64));
|
||||
}
|
||||
|
||||
/// Panics if the parent branch cannot be found.
|
||||
pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K)
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
{
|
||||
// Find the right segment
|
||||
let branchseg_id = *self
|
||||
.branches
|
||||
.get(parent)
|
||||
.expect("should had found the parent by key");
|
||||
let _branchseg = &mut self.segments[branchseg_id];
|
||||
|
||||
// Create branch name for it
|
||||
self.branches.insert(name, branchseg_id);
|
||||
}
|
||||
|
||||
pub fn calculate(&mut self, retention_period: u64) -> SegmentSize {
|
||||
// Phase 1: Mark all the segments that need to be retained
|
||||
for (_branch, &last_seg_id) in self.branches.iter() {
|
||||
let last_seg = &self.segments[last_seg_id];
|
||||
let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period);
|
||||
let mut seg_id = last_seg_id;
|
||||
loop {
|
||||
let seg = &mut self.segments[seg_id];
|
||||
if seg.end_lsn < cutoff_lsn {
|
||||
break;
|
||||
}
|
||||
seg.needed = true;
|
||||
if let Some(prev_seg_id) = seg.parent {
|
||||
seg_id = prev_seg_id;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: For each oldest segment in a chain that needs to be retained,
|
||||
// calculate if we should store snapshot or WAL
|
||||
self.size_from_snapshot_later(0)
|
||||
}
|
||||
|
||||
fn size_from_wal(&self, seg_id: usize) -> SegmentSize {
|
||||
let seg = &self.segments[seg_id];
|
||||
|
||||
let this_size = seg.end_lsn - seg.start_lsn;
|
||||
|
||||
let mut children = Vec::new();
|
||||
|
||||
// try both ways
|
||||
for &child_id in seg.children_after.iter() {
|
||||
// try each child both ways
|
||||
let child = &self.segments[child_id];
|
||||
let p1 = self.size_from_wal(child_id);
|
||||
|
||||
let p = if !child.needed {
|
||||
let p2 = self.size_from_snapshot_later(child_id);
|
||||
if p1.total() < p2.total() {
|
||||
p1
|
||||
} else {
|
||||
p2
|
||||
}
|
||||
} else {
|
||||
p1
|
||||
};
|
||||
children.push(p);
|
||||
}
|
||||
SegmentSize {
|
||||
seg_id,
|
||||
method: if seg.needed { WalNeeded } else { Wal },
|
||||
this_size,
|
||||
children,
|
||||
}
|
||||
}
|
||||
|
||||
fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize {
|
||||
// If this is needed, then it's time to do the snapshot and continue
|
||||
// with the WAL method.
let seg = &self.segments[seg_id];
|
||||
//eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed);
|
||||
if seg.needed {
|
||||
let mut children = Vec::new();
|
||||
|
||||
for &child_id in seg.children_after.iter() {
|
||||
// try each child both ways
|
||||
let child = &self.segments[child_id];
|
||||
let p1 = self.size_from_wal(child_id);
|
||||
|
||||
let p = if !child.needed {
|
||||
let p2 = self.size_from_snapshot_later(child_id);
|
||||
if p1.total() < p2.total() {
|
||||
p1
|
||||
} else {
|
||||
p2
|
||||
}
|
||||
} else {
|
||||
p1
|
||||
};
|
||||
children.push(p);
|
||||
}
|
||||
SegmentSize {
|
||||
seg_id,
|
||||
method: WalNeeded,
|
||||
this_size: seg.start_size,
|
||||
children,
|
||||
}
|
||||
} else {
|
||||
// If any of the direct children are "needed", we need to be able to reconstruct the state here.
let mut children_needed = false;
|
||||
for &child in seg.children_after.iter() {
|
||||
let seg = &self.segments[child];
|
||||
if seg.needed {
|
||||
children_needed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let method1 = if !children_needed {
|
||||
let mut children = Vec::new();
|
||||
for child in seg.children_after.iter() {
|
||||
children.push(self.size_from_snapshot_later(*child));
|
||||
}
|
||||
Some(SegmentSize {
|
||||
seg_id,
|
||||
method: Skipped,
|
||||
this_size: 0,
|
||||
children,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// If this is a junction, consider snapshotting here
let method2 = if children_needed || seg.children_after.len() >= 2 {
|
||||
let mut children = Vec::new();
|
||||
for child in seg.children_after.iter() {
|
||||
children.push(self.size_from_wal(*child));
|
||||
}
|
||||
Some(SegmentSize {
|
||||
seg_id,
|
||||
method: SnapshotAfter,
|
||||
this_size: seg.end_size.unwrap(),
|
||||
children,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
match (method1, method2) {
|
||||
(None, None) => panic!(),
|
||||
(Some(method), None) => method,
|
||||
(None, Some(method)) => method,
|
||||
(Some(method1), Some(method2)) => {
|
||||
if method1.total() < method2.total() {
|
||||
method1
|
||||
} else {
|
||||
method2
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_segments(self) -> Vec<Segment> {
|
||||
self.segments
|
||||
}
|
||||
}
|
||||
libs/tenant_size_model/src/main.rs (new file, 268 lines)
@@ -0,0 +1,268 @@
|
||||
//! Tenant size model testing ground.
|
||||
//!
|
||||
//! Has a number of scenarios and a `main` for invoking these by number, calculating the history
|
||||
//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios
|
||||
//! into pngs.
|
||||
|
||||
use tenant_size_model::{Segment, SegmentSize, Storage};
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
fn scenario_1() -> (Vec<Segment>, SegmentSize) {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000);
|
||||
|
||||
(storage.into_segments(), size)
|
||||
}
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
fn scenario_2() -> (Vec<Segment>, SegmentSize) {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child");
|
||||
storage.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
storage.update("main", 1_000);
|
||||
|
||||
let size = storage.calculate(1000);
|
||||
|
||||
(storage.into_segments(), size)
|
||||
}
|
||||
|
||||
// Like 2, but more updates on main
|
||||
fn scenario_3() -> (Vec<Segment>, SegmentSize) {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child");
|
||||
storage.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000);
|
||||
|
||||
(storage.into_segments(), size)
|
||||
}
|
||||
|
||||
// Diverged branches
|
||||
fn scenario_4() -> (Vec<Segment>, SegmentSize) {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child");
|
||||
storage.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..8 {
|
||||
storage.update("main", 1_000);
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000);
|
||||
|
||||
(storage.into_segments(), size)
|
||||
}
|
||||
|
||||
fn scenario_5() -> (Vec<Segment>, SegmentSize) {
|
||||
let mut storage = Storage::new("a");
|
||||
storage.insert("a", 5000);
|
||||
storage.branch("a", "b");
|
||||
storage.update("b", 4000);
|
||||
storage.update("a", 2000);
|
||||
storage.branch("a", "c");
|
||||
storage.insert("c", 4000);
|
||||
storage.insert("a", 2000);
|
||||
|
||||
let size = storage.calculate(5000);
|
||||
|
||||
(storage.into_segments(), size)
|
||||
}
|
||||
|
||||
fn scenario_6() -> (Vec<Segment>, SegmentSize) {
|
||||
use std::borrow::Cow;
|
||||
|
||||
const NO_OP: Cow<'static, str> = Cow::Borrowed("");
|
||||
|
||||
let branches = [
|
||||
Some(0x7ff1edab8182025f15ae33482edb590a_u128),
|
||||
Some(0xb1719e044db05401a05a2ed588a3ad3f),
|
||||
Some(0xb68d6691c895ad0a70809470020929ef),
|
||||
];
|
||||
|
||||
// compared to other scenarios, this one uses bytes instead of kB
|
||||
|
||||
let mut storage = Storage::new(None);
|
||||
|
||||
storage.branch(&None, branches[0]); // at 0
|
||||
storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064
|
||||
storage.branch(&branches[0], branches[1]); // at 108951064
|
||||
storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472
|
||||
storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424
|
||||
storage.branch(&branches[0], branches[2]); // at 283415424
|
||||
storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616
|
||||
storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400
|
||||
|
||||
let size = storage.calculate(100_000);
|
||||
|
||||
(storage.into_segments(), size)
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let args: Vec<String> = std::env::args().collect();
|
||||
|
||||
let scenario = if args.len() < 2 { "1" } else { &args[1] };
|
||||
|
||||
let (segments, size) = match scenario {
|
||||
"1" => scenario_1(),
|
||||
"2" => scenario_2(),
|
||||
"3" => scenario_3(),
|
||||
"4" => scenario_4(),
|
||||
"5" => scenario_5(),
|
||||
"6" => scenario_6(),
|
||||
other => {
|
||||
eprintln!("invalid scenario {}", other);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
graphviz_tree(&segments, &size);
|
||||
}
|
||||
|
||||
fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
|
||||
use tenant_size_model::SegmentMethod::*;
|
||||
|
||||
let seg_id = node.seg_id;
|
||||
let seg = segments.get(seg_id).unwrap();
|
||||
let lsn = seg.end_lsn;
|
||||
let size = seg.end_size.unwrap_or(0);
|
||||
let method = node.method;
|
||||
|
||||
println!(" {{");
|
||||
println!(" node [width=0.1 height=0.1 shape=oval]");
|
||||
|
||||
let tenant_size = node.total_children();
|
||||
|
||||
let penwidth = if seg.needed { 6 } else { 3 };
|
||||
let x = match method {
|
||||
SnapshotAfter =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"),
|
||||
Wal =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
|
||||
WalNeeded =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
|
||||
Skipped =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"),
|
||||
};
|
||||
|
||||
println!(" \"seg{seg_id}\" [{x}]");
|
||||
println!(" }}");
|
||||
|
||||
// Recurse. Much of the data is actually on the edge
|
||||
for child in node.children.iter() {
|
||||
let child_id = child.seg_id;
|
||||
graphviz_recurse(segments, child);
|
||||
|
||||
let edge_color = match child.method {
|
||||
SnapshotAfter => "gray",
|
||||
Wal => "black",
|
||||
WalNeeded => "black",
|
||||
Skipped => "gray",
|
||||
};
|
||||
|
||||
println!(" {{");
|
||||
println!(" edge [] ");
|
||||
print!(" \"seg{seg_id}\" -> \"seg{child_id}\" [");
|
||||
print!("color={edge_color}");
|
||||
if child.method == WalNeeded {
|
||||
print!(" penwidth=6");
|
||||
}
|
||||
if child.method == Wal {
|
||||
print!(" penwidth=3");
|
||||
}
|
||||
|
||||
let next = segments.get(child_id).unwrap();
|
||||
|
||||
if next.op.is_empty() {
|
||||
print!(
|
||||
" label=\"{} / {}\"",
|
||||
next.end_lsn - seg.end_lsn,
|
||||
(next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128)
|
||||
);
|
||||
} else {
|
||||
print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);
|
||||
}
|
||||
println!("]");
|
||||
println!(" }}");
|
||||
}
|
||||
}
|
||||
|
||||
fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
|
||||
println!("digraph G {{");
|
||||
println!(" fontname=\"Helvetica,Arial,sans-serif\"");
|
||||
println!(" node [fontname=\"Helvetica,Arial,sans-serif\"]");
|
||||
println!(" edge [fontname=\"Helvetica,Arial,sans-serif\"]");
|
||||
println!(" graph [center=1 rankdir=LR]");
|
||||
println!(" edge [dir=none]");
|
||||
|
||||
graphviz_recurse(segments, tree);
|
||||
|
||||
println!("}}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scenarios_return_same_size() {
|
||||
type ScenarioFn = fn() -> (Vec<Segment>, SegmentSize);
|
||||
let truths: &[(u32, ScenarioFn, _)] = &[
|
||||
(line!(), scenario_1, 8000),
|
||||
(line!(), scenario_2, 9000),
|
||||
(line!(), scenario_3, 13000),
|
||||
(line!(), scenario_4, 16000),
|
||||
(line!(), scenario_5, 17000),
|
||||
(line!(), scenario_6, 333_792_000),
|
||||
];
|
||||
|
||||
for (line, scenario, expected) in truths {
|
||||
let (_, size) = scenario();
|
||||
assert_eq!(*expected, size.total_children(), "scenario on line {line}");
|
||||
}
|
||||
}
|
||||
@@ -9,9 +9,6 @@ anyhow = "1.0"
|
||||
bincode = "1.3"
|
||||
bytes = "1.0.1"
|
||||
hyper = { version = "0.14.7", features = ["full"] }
|
||||
pin-project-lite = "0.2.7"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
routerify = "3"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
@@ -33,8 +30,8 @@ once_cell = "1.13.0"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
|
||||
|
||||
metrics = { path = "../metrics" }
|
||||
pq_proto = { path = "../pq_proto" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
use postgres::Config;
|
||||
|
||||
pub fn connection_host_port(config: &Config) -> (String, u16) {
|
||||
assert_eq!(
|
||||
config.get_hosts().len(),
|
||||
1,
|
||||
"only one pair of host and port is supported in connection string"
|
||||
);
|
||||
assert_eq!(
|
||||
config.get_ports().len(),
|
||||
1,
|
||||
"only one pair of host and port is supported in connection string"
|
||||
);
|
||||
let host = match &config.get_hosts()[0] {
|
||||
postgres::config::Host::Tcp(host) => host.as_ref(),
|
||||
postgres::config::Host::Unix(host) => host.to_str().unwrap(),
|
||||
};
|
||||
(host.to_owned(), config.get_ports()[0])
|
||||
}
|
||||
|
||||
pub fn connection_address(config: &Config) -> String {
|
||||
let (host, port) = connection_host_port(config);
|
||||
format!("{}:{}", host, port)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_connection_host_port() {
|
||||
let config: Config = "postgresql://no_user@localhost:64000/no_db"
|
||||
.parse()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
connection_host_port(&config),
|
||||
("localhost".to_owned(), 64000)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "only one pair of host and port is supported in connection string")]
|
||||
fn test_connection_host_port_multiple_ports() {
|
||||
let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db"
|
||||
.parse()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
connection_host_port(&config),
|
||||
("localhost".to_owned(), 64000)
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -204,6 +204,17 @@ pub struct TenantId(Id);
|
||||
|
||||
id_newtype!(TenantId);
|
||||
|
||||
/// Neon Connection Id identifies long-lived connections (for example a pagestream
|
||||
/// connection with the page_service). It is used for better logging and tracing.
///
|
||||
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
|
||||
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
|
||||
/// See [`Id`] for alternative ways to serialize it.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
|
||||
pub struct ConnectionId(Id);
|
||||
|
||||
id_newtype!(ConnectionId);
|
||||
|
||||
// A pair uniquely identifying Neon instance.
|
||||
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct TenantTimelineId {
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
//! `utils` is intended to be a place to put code that is shared
|
||||
//! between other crates in this repository.
|
||||
|
||||
#![allow(clippy::manual_range_contains)]
|
||||
|
||||
/// `Lsn` type implements common tasks on Log Sequence Numbers
|
||||
pub mod lsn;
|
||||
/// SeqWait allows waiting for a future sequence number to arrive
|
||||
@@ -17,10 +15,6 @@ pub mod vec_map;
|
||||
pub mod bin_ser;
|
||||
pub mod postgres_backend;
|
||||
pub mod postgres_backend_async;
|
||||
pub mod pq_proto;
|
||||
|
||||
// dealing with connstring parsing and handy access to it's parts
|
||||
pub mod connstring;
|
||||
|
||||
// helper functions for creating and fsyncing
|
||||
pub mod crashsafe;
|
||||
@@ -39,13 +33,12 @@ pub mod sock_split;
|
||||
// common log initialisation routine
|
||||
pub mod logging;
|
||||
|
||||
pub mod lock_file;
|
||||
|
||||
// Misc
|
||||
pub mod accum;
|
||||
pub mod shutdown;
|
||||
|
||||
// Tools for calling certain async methods in sync contexts
|
||||
pub mod sync;
|
||||
|
||||
// Utility for binding TcpListeners with proper socket options.
|
||||
pub mod tcp_listener;
|
||||
|
||||
@@ -55,6 +48,25 @@ pub mod nonblock;
|
||||
// Default signal handling
|
||||
pub mod signals;
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
#[macro_export]
|
||||
macro_rules! failpoint_sleep_millis_async {
|
||||
($name:literal) => {{
|
||||
let should_sleep: Option<std::time::Duration> = (|| {
|
||||
fail::fail_point!($name, |v: Option<_>| {
|
||||
let millis = v.unwrap().parse::<u64>().unwrap();
|
||||
Some(Duration::from_millis(millis))
|
||||
});
|
||||
None
|
||||
})();
|
||||
if let Some(d) = should_sleep {
|
||||
tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d);
|
||||
tokio::time::sleep(d).await;
|
||||
tracing::info!("failpoint {:?}: sleep done", $name);
|
||||
}
|
||||
}};
|
||||
}
|
||||
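A hedged usage sketch for the failpoint macro above; it assumes the `fail`, `tokio` and `tracing` crates as dependencies, `std::time::Duration` in scope at the call site (the macro body refers to `Duration` unqualified), and a made-up failpoint name:

```rust
use std::time::Duration;
use utils::failpoint_sleep_millis_async;

async fn flush_layers() {
    // With e.g. fail::cfg("flush-layer-pause", "return(2000)") configured,
    // this sleeps for 2000 ms at this point; otherwise it is a no-op.
    failpoint_sleep_millis_async!("flush-layer-pause");
    // ... the actual work would continue here ...
}
```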
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
libs/utils/src/lock_file.rs (new file, 81 lines)
@@ -0,0 +1,81 @@
|
||||
//! A module to create and read lock files. A lock file ensures that only one
|
||||
//! process is running at a time, in a particular directory.
|
||||
//!
|
||||
//! File locking is done using [`fcntl::flock`], which means that holding the
|
||||
//! lock on the file only prevents acquiring another lock on it; all other
//! operations are still possible: other processes can still open, read,
//! write, or remove the file, for example.
|
||||
//! If the file is removed while a process is holding a lock on it,
|
||||
//! the process that holds the lock does not get any error or notification.
|
||||
//! Furthermore, you can create a new file with the same name and lock the new file,
|
||||
//! while the old process is still running.
|
||||
//! Deleting the lock file while the locking process is still running is a bad idea!
|
||||
|
||||
use std::{fs, os::unix::prelude::AsRawFd, path::Path};
|
||||
|
||||
use anyhow::Context;
|
||||
use nix::fcntl;
|
||||
|
||||
use crate::crashsafe;
|
||||
|
||||
pub enum LockCreationResult {
|
||||
Created {
|
||||
new_lock_contents: String,
|
||||
file: fs::File,
|
||||
},
|
||||
AlreadyLocked {
|
||||
existing_lock_contents: String,
|
||||
},
|
||||
CreationFailed(anyhow::Error),
|
||||
}
|
||||
|
||||
/// Creates a lock file in the path given and writes the given contents into the file.
|
||||
/// Note: the lock is automatically released when the file is closed. You might want to use `Box::leak` to make sure it lives until the end of the program.
pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
|
||||
let lock_file = match fs::OpenOptions::new()
|
||||
.create(true) // O_CREAT
|
||||
.write(true)
|
||||
.open(lock_file_path)
|
||||
.context("Failed to open lock file")
|
||||
{
|
||||
Ok(file) => file,
|
||||
Err(e) => return LockCreationResult::CreationFailed(e),
|
||||
};
|
||||
|
||||
match fcntl::flock(
|
||||
lock_file.as_raw_fd(),
|
||||
fcntl::FlockArg::LockExclusiveNonblock,
|
||||
) {
|
||||
Ok(()) => {
|
||||
match lock_file
|
||||
.set_len(0)
|
||||
.context("Failed to truncate lockfile")
|
||||
.and_then(|()| {
|
||||
fs::write(lock_file_path, &contents).with_context(|| {
|
||||
format!("Failed to write '{contents}' contents into lockfile")
|
||||
})
|
||||
})
|
||||
.and_then(|()| {
|
||||
crashsafe::fsync_file_and_parent(lock_file_path)
|
||||
.context("Failed to fsync lockfile")
|
||||
}) {
|
||||
Ok(()) => LockCreationResult::Created {
|
||||
new_lock_contents: contents,
|
||||
file: lock_file,
|
||||
},
|
||||
Err(e) => LockCreationResult::CreationFailed(e),
|
||||
}
|
||||
}
|
||||
Err(nix::errno::Errno::EAGAIN) => {
|
||||
match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
|
||||
Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
|
||||
existing_lock_contents,
|
||||
},
|
||||
Err(e) => LockCreationResult::CreationFailed(e),
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
|
||||
}
|
||||
}
|
||||
}
|
||||
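A minimal usage sketch for the lock file API above (the module path follows the `pub mod lock_file` added to `utils` in this diff; the lock path and PID contents are illustrative only):

```rust
use std::path::Path;
use utils::lock_file::{create_lock_file, LockCreationResult};

fn main() {
    let lock_path = Path::new("/tmp/example.lock"); // illustrative path
    match create_lock_file(lock_path, format!("{}\n", std::process::id())) {
        LockCreationResult::Created { new_lock_contents, file } => {
            println!("acquired lock, wrote {new_lock_contents:?}");
            // Keep the file handle alive for as long as the lock must be held,
            // e.g. via Box::leak as the doc comment above suggests.
            Box::leak(Box::new(file));
        }
        LockCreationResult::AlreadyLocked { existing_lock_contents } => {
            eprintln!("already locked by: {existing_lock_contents:?}");
        }
        LockCreationResult::CreationFailed(e) => {
            eprintln!("failed to create lock file: {e:#}");
        }
    }
}
```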
@@ -1,10 +1,6 @@
|
||||
use std::{
|
||||
fs::{File, OpenOptions},
|
||||
path::Path,
|
||||
str::FromStr,
|
||||
};
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::Context;
|
||||
use strum_macros::{EnumString, EnumVariantNames};
|
||||
|
||||
#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
|
||||
@@ -25,19 +21,8 @@ impl LogFormat {
|
||||
})
|
||||
}
|
||||
}
|
||||
pub fn init(
|
||||
log_filename: impl AsRef<Path>,
|
||||
daemonize: bool,
|
||||
log_format: LogFormat,
|
||||
) -> Result<File> {
|
||||
// Don't open the same file for output multiple times;
|
||||
// the different fds could overwrite each other's output.
|
||||
let log_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?;
|
||||
|
||||
pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
|
||||
let default_filter_str = "info";
|
||||
|
||||
// We fall back to printing all spans at info-level or above if
|
||||
@@ -45,50 +30,16 @@ pub fn init(
|
||||
let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str));
|
||||
|
||||
let x: File = log_file.try_clone().unwrap();
|
||||
let base_logger = tracing_subscriber::fmt()
|
||||
.with_env_filter(env_filter)
|
||||
.with_target(false)
|
||||
.with_ansi(false)
|
||||
.with_writer(move || -> Box<dyn std::io::Write> {
|
||||
// we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it
|
||||
// if we do not use daemonization (e.g. in docker) it is better to log to stdout directly
|
||||
// for example to be in line with docker log command which expects logs comimg from stdout
|
||||
if daemonize {
|
||||
Box::new(x.try_clone().unwrap())
|
||||
} else {
|
||||
Box::new(std::io::stdout())
|
||||
}
|
||||
});
|
||||
.with_writer(std::io::stdout);
|
||||
|
||||
match log_format {
|
||||
LogFormat::Json => base_logger.json().init(),
|
||||
LogFormat::Plain => base_logger.init(),
|
||||
}
|
||||
|
||||
Ok(log_file)
|
||||
}
|
||||
|
||||
// #[cfg(test)]
|
||||
// Due to global logger, can't run tests in same process.
|
||||
// So until there's a non-global one, the tests are in ../tests/ as separate files.
|
||||
#[macro_export(local_inner_macros)]
|
||||
macro_rules! test_init_file_logger {
|
||||
($log_level:expr, $log_format:expr) => {{
|
||||
use std::str::FromStr;
|
||||
std::env::set_var("RUST_LOG", $log_level);
|
||||
|
||||
let tmp_dir = tempfile::TempDir::new().unwrap();
|
||||
let log_file_path = tmp_dir.path().join("logfile");
|
||||
|
||||
let log_format = $crate::logging::LogFormat::from_str($log_format).unwrap();
|
||||
let _log_file = $crate::logging::init(&log_file_path, true, log_format).unwrap();
|
||||
|
||||
let log_file = std::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&log_file_path)
|
||||
.unwrap();
|
||||
|
||||
log_file
|
||||
}};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::seqwait::MonotonicCounter;
|
||||
pub const XLOG_BLCKSZ: u32 = 8192;
|
||||
|
||||
/// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
|
||||
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Serialize, Deserialize)]
|
||||
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct Lsn(pub u64);
|
||||
|
||||
@@ -138,7 +138,7 @@ impl FromStr for Lsn {
|
||||
///
|
||||
/// If the input string is missing the '/' character, then use `Lsn::from_hex`
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let mut splitter = s.split('/');
|
||||
let mut splitter = s.trim().split('/');
|
||||
if let (Some(left), Some(right), None) = (splitter.next(), splitter.next(), splitter.next())
|
||||
{
|
||||
let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
|
||||
@@ -270,6 +270,11 @@ mod tests {
|
||||
);
|
||||
assert_eq!(Lsn::from_hex("0"), Ok(Lsn(0)));
|
||||
assert_eq!(Lsn::from_hex("F12345678AAAA5555"), Err(LsnParseError));
|
||||
|
||||
let expected_lsn = Lsn(0x3C490F8);
|
||||
assert_eq!(" 0/3C490F8".parse(), Ok(expected_lsn));
|
||||
assert_eq!("0/3C490F8 ".parse(), Ok(expected_lsn));
|
||||
assert_eq!(" 0/3C490F8 ".parse(), Ok(expected_lsn));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
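The trim() added above makes Lsn::from_str tolerant of surrounding whitespace, which the new test cases exercise. A standalone sketch of the same parsing rule (illustrative only; this is not the crate's actual Lsn type):

// Whitespace-tolerant "hi/lo" hex parse, mirroring the change above.
fn parse_lsn(s: &str) -> Option<u64> {
    let mut parts = s.trim().split('/');
    match (parts.next(), parts.next(), parts.next()) {
        (Some(hi), Some(lo), None) => {
            let hi = u32::from_str_radix(hi, 16).ok()?;
            let lo = u32::from_str_radix(lo, 16).ok()?;
            Some(((hi as u64) << 32) | lo as u64)
        }
        _ => None,
    }
}

fn main() {
    // Leading/trailing whitespace is now accepted, matching the new tests above.
    assert_eq!(parse_lsn(" 0/3C490F8 "), Some(0x3C490F8));
    assert_eq!(parse_lsn("not an lsn"), None);
}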
@@ -3,10 +3,10 @@
|
||||
//! implementation determining how to process the queries. Currently its API
|
||||
//! is rather narrow, but we can extend it once required.
|
||||
|
||||
use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
|
||||
use crate::sock_split::{BidiStream, ReadStream, WriteStream};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
|
||||
@@ -4,9 +4,9 @@
|
||||
//! is rather narrow, but we can extend it once required.
|
||||
|
||||
use crate::postgres_backend::AuthType;
|
||||
use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
|
||||
use rand::Rng;
|
||||
use std::future::Future;
|
||||
use std::net::SocketAddr;
|
||||
|
||||
@@ -1,36 +0,0 @@
|
||||
// This could be in ../src/logging.rs but since the logger is global, these
|
||||
// can't be run in threads of the same process
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, Lines};
|
||||
use tracing::*;
|
||||
use utils::test_init_file_logger;
|
||||
|
||||
fn read_lines(file: File) -> Lines<BufReader<File>> {
|
||||
BufReader::new(file).lines()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_format_has_message_and_custom_field() {
|
||||
std::env::set_var("RUST_LOG", "info");
|
||||
|
||||
let log_file = test_init_file_logger!("info", "json");
|
||||
|
||||
let custom_field: &str = "hi";
|
||||
trace!(custom = %custom_field, "test log message");
|
||||
debug!(custom = %custom_field, "test log message");
|
||||
info!(custom = %custom_field, "test log message");
|
||||
warn!(custom = %custom_field, "test log message");
|
||||
error!(custom = %custom_field, "test log message");
|
||||
|
||||
let lines = read_lines(log_file);
|
||||
for line in lines {
|
||||
let content = line.unwrap();
|
||||
let json_object = serde_json::from_str::<serde_json::Value>(&content).unwrap();
|
||||
|
||||
assert_eq!(json_object["fields"]["custom"], "hi");
|
||||
assert_eq!(json_object["fields"]["message"], "test log message");
|
||||
|
||||
assert_ne!(json_object["level"], "TRACE");
|
||||
assert_ne!(json_object["level"], "DEBUG");
|
||||
}
|
||||
}
|
||||
@@ -1,36 +0,0 @@
|
||||
// This could be in ../src/logging.rs but since the logger is global, these
|
||||
// can't be run in threads of the same process
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, Lines};
|
||||
use tracing::*;
|
||||
use utils::test_init_file_logger;
|
||||
|
||||
fn read_lines(file: File) -> Lines<BufReader<File>> {
|
||||
BufReader::new(file).lines()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_plain_format_has_message_and_custom_field() {
|
||||
std::env::set_var("RUST_LOG", "warn");
|
||||
|
||||
let log_file = test_init_file_logger!("warn", "plain");
|
||||
|
||||
let custom_field: &str = "hi";
|
||||
trace!(custom = %custom_field, "test log message");
|
||||
debug!(custom = %custom_field, "test log message");
|
||||
info!(custom = %custom_field, "test log message");
|
||||
warn!(custom = %custom_field, "test log message");
|
||||
error!(custom = %custom_field, "test log message");
|
||||
|
||||
let lines = read_lines(log_file);
|
||||
for line in lines {
|
||||
let content = line.unwrap();
|
||||
serde_json::from_str::<serde_json::Value>(&content).unwrap_err();
|
||||
assert!(content.contains("custom=hi"));
|
||||
assert!(content.contains("test log message"));
|
||||
|
||||
assert!(!content.contains("TRACE"));
|
||||
assert!(!content.contains("DEBUG"));
|
||||
assert!(!content.contains("INFO"));
|
||||
}
|
||||
}
|
||||
@@ -12,62 +12,61 @@ testing = ["fail/failpoints"]
|
||||
profiling = ["pprof"]
|
||||
|
||||
[dependencies]
|
||||
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
bytes = "1.0.1"
|
||||
chrono = "0.4.19"
|
||||
clap = { version = "4.0", features = ["string"] }
|
||||
close_fds = "0.3.2"
|
||||
const_format = "0.2.21"
|
||||
crc32c = "0.6.0"
|
||||
crossbeam-utils = "0.8.5"
|
||||
fail = "0.5.0"
|
||||
futures = "0.3.13"
|
||||
git-version = "0.3.5"
|
||||
hex = "0.4.3"
|
||||
humantime = "2.1.0"
|
||||
humantime-serde = "1.1.1"
|
||||
hyper = "0.14"
|
||||
itertools = "0.10.3"
|
||||
clap = { version = "4.0", features = ["string"] }
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
nix = "0.25"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13.0"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
crc32c = "0.6.0"
|
||||
thiserror = "1.0"
|
||||
tar = "0.4.33"
|
||||
humantime = "2.1.0"
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
rstar = "0.9.3"
|
||||
scopeguard = "1.1.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "2.0"
|
||||
humantime-serde = "1.1.1"
|
||||
|
||||
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
|
||||
|
||||
toml_edit = { version = "0.14", features = ["easy"] }
|
||||
scopeguard = "1.1.0"
|
||||
const_format = "0.2.21"
|
||||
tracing = "0.1.36"
|
||||
signal-hook = "0.3.10"
|
||||
svg_fmt = "0.4.1"
|
||||
tar = "0.4.33"
|
||||
thiserror = "1.0"
|
||||
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
|
||||
toml_edit = { version = "0.14", features = ["easy"] }
|
||||
tracing = "0.1.36"
|
||||
url = "2"
|
||||
nix = "0.25"
|
||||
once_cell = "1.13.0"
|
||||
crossbeam-utils = "0.8.5"
|
||||
fail = "0.5.0"
|
||||
git-version = "0.3.5"
|
||||
rstar = "0.9.3"
|
||||
num-traits = "0.2.15"
|
||||
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
|
||||
walkdir = "2.3.2"
|
||||
|
||||
pageserver_api = { path = "../libs/pageserver_api" }
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
etcd_broker = { path = "../libs/etcd_broker" }
|
||||
metrics = { path = "../libs/metrics" }
|
||||
utils = { path = "../libs/utils" }
|
||||
pageserver_api = { path = "../libs/pageserver_api" }
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
pq_proto = { path = "../libs/pq_proto" }
|
||||
remote_storage = { path = "../libs/remote_storage" }
|
||||
tenant_size_model = { path = "../libs/tenant_size_model" }
|
||||
utils = { path = "../libs/utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
close_fds = "0.3.2"
|
||||
walkdir = "2.3.2"
|
||||
svg_fmt = "0.4.1"
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = "0.4"
|
||||
@@ -77,3 +76,7 @@ tempfile = "3.2"
|
||||
[[bench]]
|
||||
name = "bench_layer_map"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "bench_walredo"
|
||||
harness = false
|
||||
|
||||
453 pageserver/benches/bench_walredo.rs (new file; diff suppressed because one or more lines are too long)
@@ -1,38 +1,37 @@
|
||||
//! Main entry point for the Page Server executable.
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use fail::FailScenario;
|
||||
use nix::unistd::Pid;
|
||||
use tracing::*;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use fail::FailScenario;
|
||||
use metrics::set_build_info_metric;
|
||||
|
||||
use pageserver::{
|
||||
config::{defaults::*, PageServerConf},
|
||||
http, page_cache, page_image_cache, page_service, profiling, task_mgr,
|
||||
http, page_cache, page_service, profiling, task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::{
|
||||
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
|
||||
},
|
||||
tenant_mgr, virtual_file, LOG_FILE_NAME,
|
||||
tenant_mgr, virtual_file,
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
logging,
|
||||
lock_file, logging,
|
||||
postgres_backend::AuthType,
|
||||
project_git_version,
|
||||
shutdown::exit_now,
|
||||
signals::{self, Signal},
|
||||
tcp_listener,
|
||||
};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
const PID_FILE_NAME: &str = "pageserver.pid";
|
||||
|
||||
const FEATURES: &[&str] = &[
|
||||
#[cfg(feature = "testing")]
|
||||
"testing",
|
||||
@@ -65,6 +64,7 @@ fn main() -> anyhow::Result<()> {
|
||||
let workdir = workdir
|
||||
.canonicalize()
|
||||
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
|
||||
|
||||
let cfg_file_path = workdir.join("pageserver.toml");
|
||||
|
||||
// Set CWD to workdir for non-daemon modes
|
||||
@@ -75,8 +75,6 @@ fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
})?;
|
||||
|
||||
let daemonize = arg_matches.get_flag("daemonize");
|
||||
|
||||
let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
|
||||
ControlFlow::Continue(conf) => conf,
|
||||
ControlFlow::Break(()) => {
|
||||
@@ -101,9 +99,8 @@ fn main() -> anyhow::Result<()> {
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
page_image_cache::init(64 * conf.page_cache_size); // temporary hack for benchmarking
|
||||
|
||||
start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
|
||||
start_pageserver(conf).context("Failed to start pageserver")?;
|
||||
|
||||
scenario.teardown();
|
||||
Ok(())
|
||||
@@ -198,12 +195,48 @@ fn initialize_config(
|
||||
})
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
|
||||
// Initialize logger
|
||||
let log_file = logging::init(LOG_FILE_NAME, daemonize, conf.log_format)?;
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
logging::init(conf.log_format)?;
|
||||
info!("version: {}", version());
|
||||
|
||||
// If any failpoints were set from FAILPOINTS environment variable,
|
||||
// print them to the log for debugging purposes
|
||||
let failpoints = fail::list();
|
||||
if !failpoints.is_empty() {
|
||||
info!(
|
||||
"started with failpoints: {}",
|
||||
failpoints
|
||||
.iter()
|
||||
.map(|(name, actions)| format!("{name}={actions}"))
|
||||
.collect::<Vec<String>>()
|
||||
.join(";")
|
||||
)
|
||||
}
|
||||
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
|
||||
lock_file::LockCreationResult::Created {
|
||||
new_lock_contents,
|
||||
file,
|
||||
} => {
|
||||
info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
|
||||
file
|
||||
}
|
||||
lock_file::LockCreationResult::AlreadyLocked {
|
||||
existing_lock_contents,
|
||||
} => anyhow::bail!(
|
||||
"Could not lock pid file; pageserver is already running in {:?} with PID {}",
|
||||
conf.workdir,
|
||||
existing_lock_contents
|
||||
),
|
||||
lock_file::LockCreationResult::CreationFailed(e) => {
|
||||
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
|
||||
}
|
||||
};
|
||||
// ensure that the lock file is held even if the main thread of the process is panics
|
||||
// we need to release the lock file only when the current process is gone
|
||||
let _ = Box::leak(Box::new(lock_file));
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
// bind sockets before daemonizing so we report errors early and do not return until we are listening
|
||||
@@ -219,33 +252,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
);
|
||||
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
|
||||
|
||||
// NB: Don't spawn any threads before daemonizing!
|
||||
if daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
// There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
let stdout = log_file
|
||||
.try_clone()
|
||||
.with_context(|| format!("Failed to clone log file '{:?}'", log_file))?;
|
||||
let stderr = log_file;
|
||||
|
||||
let daemonize = Daemonize::new()
|
||||
.pid_file("pageserver.pid")
|
||||
.working_directory(".")
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
|
||||
// XXX: The parent process should exit abruptly right after
|
||||
// it has spawned a child to prevent coverage machinery from
|
||||
// dumping stats into a `profraw` file now owned by the child.
|
||||
// Otherwise, the coverage data will be damaged.
|
||||
match daemonize.exit_action(|| exit_now(0)).start() {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(err) => bail!("{err}. could not daemonize. bailing."),
|
||||
}
|
||||
}
|
||||
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
|
||||
// start profiler (if enabled)
|
||||
@@ -348,14 +354,6 @@ fn cli() -> Command {
|
||||
Command::new("Neon page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.version(version())
|
||||
.arg(
|
||||
|
||||
Arg::new("daemonize")
|
||||
.short('d')
|
||||
.long("daemonize")
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Run in the background"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("init")
|
||||
.long("init")
|
||||
|
||||
@@ -8,7 +8,9 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use std::env;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::ConnectionId;
|
||||
|
||||
use std::num::NonZeroUsize;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
@@ -48,6 +50,9 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_LOG_FORMAT: &str = "plain";
|
||||
|
||||
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
|
||||
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -67,6 +72,9 @@ pub mod defaults {
|
||||
#initial_superuser_name = '{DEFAULT_SUPERUSER}'
|
||||
|
||||
#log_format = '{DEFAULT_LOG_FORMAT}'
|
||||
|
||||
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
|
||||
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -132,6 +140,9 @@ pub struct PageServerConf {
|
||||
pub broker_endpoints: Vec<Url>,
|
||||
|
||||
pub log_format: LogFormat,
|
||||
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
|
||||
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
@@ -200,6 +211,8 @@ struct PageServerConfigBuilder {
|
||||
broker_endpoints: BuilderValue<Vec<Url>>,
|
||||
|
||||
log_format: BuilderValue<LogFormat>,
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -228,6 +241,8 @@ impl Default for PageServerConfigBuilder {
|
||||
broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
|
||||
broker_endpoints: Set(Vec::new()),
|
||||
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -304,6 +319,10 @@ impl PageServerConfigBuilder {
|
||||
self.log_format = BuilderValue::Set(log_format)
|
||||
}
|
||||
|
||||
pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) {
|
||||
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let broker_endpoints = self
|
||||
.broker_endpoints
|
||||
@@ -349,6 +368,11 @@ impl PageServerConfigBuilder {
|
||||
.broker_etcd_prefix
|
||||
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
|
||||
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
|
||||
concurrent_tenant_size_logical_size_queries: self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or(anyhow!(
|
||||
"missing concurrent_tenant_size_logical_size_queries"
|
||||
))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -391,6 +415,22 @@ impl PageServerConf {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn traces_path(&self) -> PathBuf {
|
||||
self.workdir.join("traces")
|
||||
}
|
||||
|
||||
pub fn trace_path(
|
||||
&self,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
connection_id: &ConnectionId,
|
||||
) -> PathBuf {
|
||||
self.traces_path()
|
||||
.join(tenant_id.to_string())
|
||||
.join(timeline_id.to_string())
|
||||
.join(connection_id.to_string())
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain timeline's metadata file should be located.
|
||||
pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
|
||||
@@ -476,6 +516,12 @@ impl PageServerConf {
|
||||
"log_format" => builder.log_format(
|
||||
LogFormat::from_config(&parse_toml_string(key, item)?)?
|
||||
),
|
||||
"concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
|
||||
let input = parse_toml_string(key, item)?;
|
||||
let permits = input.parse::<usize>().with_context(|| format!("expected a number of initial permits, not {input:?}"))?;
|
||||
let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
|
||||
ConfigurableSemaphore::new(permits)
|
||||
}),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -568,8 +614,9 @@ impl PageServerConf {
|
||||
PathBuf::from(format!("../tmp_check/test_{test_name}"))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn dummy_conf(repo_dir: PathBuf) -> Self {
|
||||
let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
|
||||
|
||||
PageServerConf {
|
||||
id: NodeId(0),
|
||||
wait_lsn_timeout: Duration::from_secs(60),
|
||||
@@ -580,7 +627,7 @@ impl PageServerConf {
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
superuser: "cloud_admin".to_string(),
|
||||
workdir: repo_dir,
|
||||
pg_distrib_dir: PathBuf::new(),
|
||||
pg_distrib_dir,
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
@@ -589,6 +636,7 @@ impl PageServerConf {
|
||||
broker_endpoints: Vec::new(),
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -654,6 +702,58 @@ fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Configurable semaphore permits setting.
|
||||
///
|
||||
/// Does not allow semaphore permits to be zero, because at runtime an initial value of zero permits
/// cannot be distinguished from an exhausted semaphore, leading any feature using it to await forever
/// (or until new permits are added).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ConfigurableSemaphore {
|
||||
initial_permits: NonZeroUsize,
|
||||
inner: std::sync::Arc<tokio::sync::Semaphore>,
|
||||
}
|
||||
|
||||
impl ConfigurableSemaphore {
|
||||
pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) {
|
||||
Some(x) => x,
|
||||
None => panic!("const unwrap is not yet stable"),
|
||||
};
|
||||
|
||||
/// Initialize using a non-zero amount of permits.
///
/// Require non-zero initial permits, because using permits == 0 is a crude way to disable a
|
||||
/// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
|
||||
/// behave like [`futures::future::pending`], just waiting until new permits are added.
|
||||
pub fn new(initial_permits: NonZeroUsize) -> Self {
|
||||
ConfigurableSemaphore {
|
||||
initial_permits,
|
||||
inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ConfigurableSemaphore {
|
||||
fn default() -> Self {
|
||||
Self::new(Self::DEFAULT_INITIAL)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for ConfigurableSemaphore {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
// the number of permits can be increased at runtime, so we cannot really fulfill the
|
||||
// PartialEq value equality otherwise
|
||||
self.initial_permits == other.initial_permits
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for ConfigurableSemaphore {}
|
||||
|
||||
impl ConfigurableSemaphore {
|
||||
pub fn inner(&self) -> &std::sync::Arc<tokio::sync::Semaphore> {
|
||||
&self.inner
|
||||
}
|
||||
}
|
||||
|
||||
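A rough sketch of how such a semaphore gates concurrent work; the function and variable names here are hypothetical, and only the tokio Semaphore usage mirrors the type above (assumes tokio with default runtime features):

use std::num::NonZeroUsize;
use std::sync::Arc;
use tokio::sync::Semaphore;

async fn gather_with_limit(sem: Arc<Semaphore>) {
    // One permit per in-flight logical-size calculation; extra callers wait here
    // instead of piling more expensive work onto the pageserver.
    let _permit = sem.acquire().await.expect("semaphore is never closed here");
    // ... expensive gather_size_inputs-like work would run while the permit is held ...
}

#[tokio::main]
async fn main() {
    let initial = NonZeroUsize::new(1).unwrap();
    let sem = Arc::new(Semaphore::new(initial.get()));
    gather_with_limit(sem).await;
}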
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{
|
||||
@@ -725,6 +825,7 @@ log_format = 'json'
|
||||
.expect("Failed to parse a valid broker endpoint URL")],
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -770,6 +871,7 @@ log_format = 'json'
|
||||
.expect("Failed to parse a valid broker endpoint URL")],
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::Json,
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -354,6 +354,54 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/size:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
get:
|
||||
description: |
|
||||
Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
|
||||
required:
|
||||
- id
|
||||
- size
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
format: hex
|
||||
size:
|
||||
type: integer
|
||||
description: |
|
||||
Size metric in bytes.
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -619,6 +667,7 @@ components:
|
||||
- disk_consistent_lsn
|
||||
- awaits_download
|
||||
- state
|
||||
- latest_gc_cutoff_lsn
|
||||
properties:
|
||||
timeline_id:
|
||||
type: string
|
||||
@@ -663,6 +712,9 @@ components:
|
||||
type: boolean
|
||||
state:
|
||||
type: string
|
||||
latest_gc_cutoff_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
|
||||
# These 'local' and 'remote' fields just duplicate some of the fields
|
||||
# above. They are kept for backwards-compatibility. They can be removed,
|
||||
|
||||
@@ -227,13 +227,10 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timelines = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
|
||||
let timelines = info_span!("timeline_list", tenant = %tenant_id).in_scope(|| {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
Ok(tenant.list_timelines())
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
})?;
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
for timeline in timelines {
|
||||
@@ -523,9 +520,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
// if tenant is in progress of downloading it can be absent in global tenant map
|
||||
let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false))
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false);
|
||||
|
||||
let state = get_state(&request);
|
||||
let remote_index = &state.remote_index;
|
||||
@@ -571,6 +566,44 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
)
|
||||
}
|
||||
|
||||
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// this can be a long operation; it is currently not backed by any request coalescing or similar
|
||||
let inputs = tenant
|
||||
.gather_size_inputs()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let size = inputs.calculate().map_err(ApiError::InternalServerError)?;
|
||||
|
||||
/// Private response type with the additional "unstable" `inputs` field.
|
||||
///
|
||||
/// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is
|
||||
/// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Serialize)]
|
||||
struct TenantHistorySize {
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
id: TenantId,
|
||||
/// Size is a mixture of WAL and logical size, so the unit is bytes.
|
||||
size: u64,
|
||||
inputs: crate::tenant::size::ModelInputs,
|
||||
}
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
TenantHistorySize {
|
||||
id: tenant_id,
|
||||
size,
|
||||
inputs,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
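For illustration, the JSON produced by this handler has roughly the shape below. The id and size values are made up, and the inputs object is intentionally left unspecified (it is excluded from the OpenAPI schema above on purpose):

// Rough shape of the /v1/tenant/{tenant_id}/size response (illustrative values).
use serde_json::json;

fn example_tenant_size_response() -> serde_json::Value {
    json!({
        "id": "3d1f7595b468230304e0b73cecbcb081",
        "size": 12_345_678u64,
        "inputs": {}
    })
}

fn main() {
    println!("{}", example_tenant_size_response());
}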
// Helper function to standardize the error messages we produce on bad durations
|
||||
//
|
||||
// Intended to be used with anyhow's `with_context`, e.g.:
|
||||
@@ -585,6 +618,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
println!("tenant create: {:?}", request_data.trace_read_requests);
|
||||
let remote_index = get_state(&request).remote_index.clone();
|
||||
|
||||
let mut tenant_conf = TenantConfOpt::default();
|
||||
@@ -626,6 +660,9 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
|
||||
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
|
||||
}
|
||||
if let Some(trace_read_requests) = request_data.trace_read_requests {
|
||||
tenant_conf.trace_read_requests = Some(trace_read_requests);
|
||||
}
|
||||
|
||||
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
|
||||
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
|
||||
@@ -713,6 +750,9 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
|
||||
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
|
||||
}
|
||||
if let Some(trace_read_requests) = request_data.trace_read_requests {
|
||||
tenant_conf.trace_read_requests = Some(trace_read_requests);
|
||||
}
|
||||
|
||||
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
|
||||
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
|
||||
@@ -792,14 +832,14 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
|
||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||
|
||||
let _span_guard =
|
||||
info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered();
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
|
||||
// Use tenant's pitr setting
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
let result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
|
||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
@@ -835,6 +875,7 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
.map_err(ApiError::NotFound)?;
|
||||
timeline
|
||||
.checkpoint(CheckpointConfig::Forced)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -898,6 +939,7 @@ pub fn make_router(
|
||||
.get("/v1/tenant", tenant_list_handler)
|
||||
.post("/v1/tenant", tenant_create_handler)
|
||||
.get("/v1/tenant/:tenant_id", tenant_status)
|
||||
.get("/v1/tenant/:tenant_id/size", tenant_size_handler)
|
||||
.put("/v1/tenant/config", tenant_config_handler)
|
||||
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
|
||||
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
|
||||
|
||||
@@ -5,7 +5,6 @@ pub mod import_datadir;
|
||||
pub mod keyspace;
|
||||
pub mod metrics;
|
||||
pub mod page_cache;
|
||||
pub mod page_image_cache;
|
||||
pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod profiling;
|
||||
@@ -16,6 +15,7 @@ pub mod tenant;
|
||||
pub mod tenant_config;
|
||||
pub mod tenant_mgr;
|
||||
pub mod tenant_tasks;
|
||||
pub mod trace;
|
||||
pub mod virtual_file;
|
||||
pub mod walingest;
|
||||
pub mod walreceiver;
|
||||
@@ -44,8 +44,6 @@ pub const DEFAULT_PG_VERSION: u32 = 14;
|
||||
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
||||
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
||||
|
||||
pub const LOG_FILE_NAME: &str = "pageserver.log";
|
||||
|
||||
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
/// Config for the Repository checkpointer
|
||||
@@ -82,7 +80,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
|
||||
// There should be nothing left, but let's be sure
|
||||
task_mgr::shutdown_tasks(None, None, None).await;
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
@@ -31,6 +31,7 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[
|
||||
"compact",
|
||||
"create images",
|
||||
"init logical size",
|
||||
"logical size",
|
||||
"load layer map",
|
||||
"gc",
|
||||
];
|
||||
@@ -365,6 +366,7 @@ pub struct TimelineMetrics {
|
||||
pub compact_time_histo: Histogram,
|
||||
pub create_images_time_histo: Histogram,
|
||||
pub init_logical_size_histo: Histogram,
|
||||
pub logical_size_histo: Histogram,
|
||||
pub load_layer_map_histo: Histogram,
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub wait_lsn_time_histo: Histogram,
|
||||
@@ -397,6 +399,9 @@ impl TimelineMetrics {
|
||||
let init_logical_size_histo = STORAGE_TIME
|
||||
.get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let logical_size_histo = STORAGE_TIME
|
||||
.get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let load_layer_map_histo = STORAGE_TIME
|
||||
.get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -428,6 +433,7 @@ impl TimelineMetrics {
|
||||
compact_time_histo,
|
||||
create_images_time_histo,
|
||||
init_logical_size_histo,
|
||||
logical_size_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
wait_lsn_time_histo,
|
||||
|
||||
@@ -108,10 +108,10 @@ enum CacheKey {
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
pub struct MaterializedPageHashKey {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub key: Key,
|
||||
struct MaterializedPageHashKey {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
key: Key,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
|
||||
@@ -1,345 +0,0 @@
|
||||
//!
|
||||
//! Global page image cache
|
||||
//!
|
||||
//! Unlike page_cache, it holds only the most recent version of reconstructed page images,
//! and it uses an invalidation mechanism to avoid layer map lookups.
|
||||
|
||||
use crate::page_cache::MaterializedPageHashKey;
|
||||
use crate::pgdatadir_mapping::{rel_block_to_key, BlockNumber};
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::Bytes;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::reltag::RelTag;
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::sync::{Arc, Condvar, Mutex};
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
static PAGE_CACHE: OnceCell<Mutex<PageImageCache>> = OnceCell::new();
|
||||
const TEST_PAGE_CACHE_SIZE: usize = 50;
|
||||
pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize;
|
||||
|
||||
enum PageImageState {
|
||||
Vacant, // entry is not used
|
||||
Loaded(bool), // page is loaded or has failed
|
||||
Loading(Option<Arc<Condvar>>), // page is in the process of loading; the Condvar is created on demand when some thread needs to wait for load completion
|
||||
}
|
||||
|
||||
struct CacheEntry {
|
||||
key: MaterializedPageHashKey,
|
||||
|
||||
// next+prev are used for LRU L2-list and next is also used for L1 free pages list
|
||||
next: usize,
|
||||
prev: usize,
|
||||
|
||||
collision: usize, // L1 hash collision chain
|
||||
|
||||
access_count: u32,
|
||||
state: PageImageState,
|
||||
}
|
||||
|
||||
pub struct PageImageCache {
|
||||
free_list: usize, // L1 list of free entries
|
||||
pages: Vec<CacheEntry>,
|
||||
hash_table: Vec<usize>, // indexes in pages array
|
||||
file: Arc<VirtualFile>,
|
||||
}
|
||||
|
||||
///
|
||||
/// Initialize the page cache. This must be called once at page server startup.
|
||||
///
|
||||
pub fn init(size: usize) {
|
||||
if PAGE_CACHE
|
||||
.set(Mutex::new(PageImageCache::new(size)))
|
||||
.is_err()
|
||||
{
|
||||
panic!("page cache already initialized");
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a handle to the page cache.
|
||||
///
|
||||
pub fn get() -> &'static Mutex<PageImageCache> {
|
||||
//
|
||||
// In unit tests, page server startup doesn't happen and no one calls
|
||||
// page_image_cache::init(). Initialize it here with a tiny cache, so that the
|
||||
// page cache is usable in unit tests.
|
||||
//
|
||||
if cfg!(test) {
|
||||
PAGE_CACHE.get_or_init(|| Mutex::new(PageImageCache::new(TEST_PAGE_CACHE_SIZE)))
|
||||
} else {
|
||||
PAGE_CACHE.get().expect("page cache not initialized")
|
||||
}
|
||||
}
|
||||
|
||||
fn hash<T: Hash>(t: &T) -> usize {
|
||||
let mut s = DefaultHasher::new();
|
||||
t.hash(&mut s);
|
||||
s.finish() as usize
|
||||
}
|
||||
|
||||
impl PageImageCache {
|
||||
fn new(size: usize) -> Self {
|
||||
let mut pages: Vec<CacheEntry> = Vec::with_capacity(size + 1);
|
||||
let hash_table = vec![0usize; size];
|
||||
let file = Arc::new(
|
||||
VirtualFile::open_with_options(
|
||||
&std::path::PathBuf::from("page.cache"),
|
||||
std::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true),
|
||||
)
|
||||
.unwrap(),
|
||||
);
|
||||
// Dummy key
|
||||
let dummy_key = MaterializedPageHashKey {
|
||||
key: Key::MIN,
|
||||
tenant_id: TenantId::from([0u8; 16]),
|
||||
timeline_id: TimelineId::from([0u8; 16]),
|
||||
};
|
||||
|
||||
// LRU list head
|
||||
pages.push(CacheEntry {
|
||||
key: dummy_key.clone(),
|
||||
next: 0,
|
||||
prev: 0,
|
||||
access_count: 0,
|
||||
collision: 0,
|
||||
state: PageImageState::Vacant,
|
||||
});
|
||||
|
||||
// Construct L1 free page list
|
||||
for i in 0..size {
|
||||
pages.push(CacheEntry {
|
||||
key: dummy_key.clone(),
|
||||
next: i + 2, // build L1-list of free pages
|
||||
prev: 0,
|
||||
access_count: 0,
|
||||
collision: 0,
|
||||
state: PageImageState::Vacant,
|
||||
});
|
||||
}
|
||||
pages[size - 1].next = 0; // end of free page list
|
||||
|
||||
PageImageCache {
|
||||
free_list: 1,
|
||||
pages,
|
||||
hash_table,
|
||||
file,
|
||||
}
|
||||
}
|
||||
|
||||
// Unlink from L2-list
|
||||
fn unlink(&mut self, index: usize) {
|
||||
let next = self.pages[index].next;
|
||||
let prev = self.pages[index].prev;
|
||||
self.pages[next].prev = prev;
|
||||
self.pages[prev].next = next;
|
||||
}
|
||||
|
||||
// Link in L2-list after specified element
|
||||
fn link_after(&mut self, after: usize, index: usize) {
|
||||
let next = self.pages[after].next;
|
||||
self.pages[index].prev = after;
|
||||
self.pages[index].next = next;
|
||||
self.pages[next].prev = index;
|
||||
self.pages[after].next = index;
|
||||
}
|
||||
|
||||
fn prune(&mut self, index: usize) {
|
||||
self.pages[index].prev = index;
|
||||
self.pages[index].next = index;
|
||||
}
|
||||
|
||||
fn is_empty(&self, index: usize) -> bool {
|
||||
self.pages[index].next == index
|
||||
}
|
||||
}
|
||||
|
||||
// Remove entry from cache: on page invalidation or relation drop
|
||||
pub fn remove(key: Key, tenant_id: TenantId, timeline_id: TimelineId) {
|
||||
let key = MaterializedPageHashKey {
|
||||
key,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
};
|
||||
let this = get();
|
||||
let mut cache = this.lock().unwrap();
|
||||
let h = hash(&key) % cache.hash_table.len();
|
||||
let mut index = cache.hash_table[h];
|
||||
let mut prev = 0usize;
|
||||
while index != 0 {
|
||||
if cache.pages[index].key == key {
|
||||
if !cache.is_empty(index) {
|
||||
cache.pages[index].state = PageImageState::Vacant;
|
||||
// Remove from LRU list
|
||||
cache.unlink(index);
|
||||
// Insert entry in free list
|
||||
cache.pages[index].next = cache.free_list;
|
||||
cache.free_list = index;
|
||||
} else {
|
||||
// Page is in the process of loading: we cannot remove it right now,
|
||||
// so just mark for deletion
|
||||
cache.pages[index].next = 0; // make is_empty == false
|
||||
}
|
||||
// Remove from hash table
|
||||
if prev == 0 {
|
||||
cache.hash_table[h] = cache.pages[index].collision;
|
||||
} else {
|
||||
cache.pages[prev].collision = cache.pages[index].collision;
|
||||
}
|
||||
break;
|
||||
}
|
||||
prev = index;
|
||||
index = cache.pages[index].collision;
|
||||
}
|
||||
// It's Ok if image not found
|
||||
}
|
||||
|
||||
// Find or load page image in the cache
|
||||
pub fn lookup(timeline: &Timeline, rel: RelTag, blkno: BlockNumber, lsn: Lsn) -> Result<Bytes> {
|
||||
let key = MaterializedPageHashKey {
|
||||
key: rel_block_to_key(rel, blkno),
|
||||
tenant_id: timeline.tenant_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
};
|
||||
let this = get();
|
||||
let mut cache = this.lock().unwrap();
|
||||
let h = hash(&key) % cache.hash_table.len();
|
||||
|
||||
'lookup: loop {
|
||||
let mut index = cache.hash_table[h];
|
||||
while index != 0 {
|
||||
if cache.pages[index].key == key {
|
||||
// cache hit
|
||||
match &cache.pages[index].state {
|
||||
PageImageState::Loaded(success) => {
|
||||
if *success {
|
||||
// Pin page
|
||||
if cache.pages[index].access_count == 0 {
|
||||
cache.unlink(index);
|
||||
}
|
||||
cache.pages[index].access_count += 1;
|
||||
let file = cache.file.clone();
|
||||
drop(cache);
|
||||
let mut buf = [0u8; PAGE_SZ];
|
||||
file.read_exact_at(&mut buf, index as u64 * PAGE_SZ as u64)?;
|
||||
cache = this.lock().unwrap();
|
||||
assert!(cache.pages[index].access_count > 0);
|
||||
cache.pages[index].access_count -= 1;
|
||||
if cache.pages[index].access_count == 0 {
|
||||
// Move to the head of LRU list
|
||||
cache.link_after(0, index);
|
||||
}
|
||||
return Ok(Bytes::from(buf.to_vec()));
|
||||
} else {
|
||||
return Err(anyhow::anyhow!("page loading failed earlier"));
|
||||
}
|
||||
}
|
||||
PageImageState::Loading(event) => {
|
||||
// Create event on which to sleep if not yet assigned
|
||||
let cv = match event {
|
||||
None => {
|
||||
let cv = Arc::new(Condvar::new());
|
||||
cache.pages[index].state =
|
||||
PageImageState::Loading(Some(cv.clone()));
|
||||
cv
|
||||
}
|
||||
Some(cv) => cv.clone(),
|
||||
};
|
||||
cache = cv.wait(cache).unwrap();
|
||||
// Retry lookup
|
||||
continue 'lookup;
|
||||
}
|
||||
PageImageState::Vacant => bail!("Vacant entry is not expected here"),
|
||||
};
|
||||
}
|
||||
index = cache.pages[index].collision;
|
||||
}
|
||||
let file = cache.file.clone();
|
||||
// Cache miss
|
||||
index = cache.free_list;
|
||||
if index == 0 {
|
||||
// no free items
|
||||
let victim = cache.pages[0].prev; // take least recently used element from the tail of LRU list
|
||||
assert!(victim != 0);
|
||||
assert!(cache.pages[victim].access_count == 0);
|
||||
// Remove victim from hash table
|
||||
let h = hash(&cache.pages[victim].key) % cache.hash_table.len();
|
||||
index = cache.hash_table[h];
|
||||
let mut prev = 0usize;
|
||||
while index != victim {
|
||||
assert!(index != 0);
|
||||
prev = index;
|
||||
index = cache.pages[index].collision;
|
||||
}
|
||||
if prev == 0 {
|
||||
cache.hash_table[h] = cache.pages[victim].collision;
|
||||
} else {
|
||||
cache.pages[prev].collision = cache.pages[victim].collision;
|
||||
}
|
||||
// and from LRU list
|
||||
cache.unlink(victim);
|
||||
|
||||
index = victim;
|
||||
} else {
|
||||
// Use next free item
|
||||
cache.free_list = cache.pages[index].next;
|
||||
}
|
||||
// Make is_empty(index) == true. If the entry is removed while it is being loaded,
|
||||
// it will be updated so that !is_empty(index)
|
||||
cache.prune(index);
|
||||
|
||||
// Insert in hash table
|
||||
cache.pages[index].collision = cache.hash_table[h];
|
||||
cache.hash_table[h] = index;
|
||||
|
||||
cache.pages[index].key = key;
|
||||
cache.pages[index].state = PageImageState::Loading(None);
|
||||
drop(cache); //release lock
|
||||
|
||||
// Load page
|
||||
let result = timeline.get_rel_page_at_lsn(rel, blkno, lsn, true);
|
||||
let mut success = false;
|
||||
if let Ok(page) = &result {
|
||||
success = true;
|
||||
file.write_all_at(&page, index as u64 * PAGE_SZ as u64)?;
|
||||
}
|
||||
cache = this.lock().unwrap();
|
||||
if let PageImageState::Loading(event) = &cache.pages[index].state {
|
||||
// Are there some waiting threads?
|
||||
if let Some(cv) = event {
|
||||
// If so, wake them up
|
||||
cv.notify_all();
|
||||
}
|
||||
} else {
|
||||
bail!("Loading state is expected");
|
||||
}
|
||||
if cache.is_empty(index) {
|
||||
// entry was not marked as deleted
|
||||
// Page is loaded
|
||||
|
||||
// match &res { ... } is same as `res.as_ref().ok().cloned()`
|
||||
cache.pages[index].state = PageImageState::Loaded(success);
|
||||
// Link the page to the head of LRU list
|
||||
cache.link_after(0, index);
|
||||
} else {
|
||||
cache.pages[index].state = PageImageState::Vacant;
|
||||
// Return page to free list
|
||||
cache.pages[index].next = cache.free_list;
|
||||
cache.free_list = index;
|
||||
}
|
||||
// only the first one gets the full error from `get_rel_page_at_lsn`
|
||||
return result;
|
||||
}
|
||||
}
|
||||
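The deleted cache above keeps its LRU list inside the same Vec as the entries, using slot 0 as a sentinel head and plain indices as links, so relinking an entry never allocates. A stripped-down sketch of that linking scheme (field names simplified):

// Array-backed doubly linked list with slot 0 as the sentinel head.
struct Node {
    prev: usize,
    next: usize,
}

fn unlink(nodes: &mut [Node], i: usize) {
    let (p, n) = (nodes[i].prev, nodes[i].next);
    nodes[p].next = n;
    nodes[n].prev = p;
}

fn link_after(nodes: &mut [Node], after: usize, i: usize) {
    let n = nodes[after].next;
    nodes[i].prev = after;
    nodes[i].next = n;
    nodes[n].prev = i;
    nodes[after].next = i;
}

fn main() {
    // Sentinel points at itself, then entry 1 is linked right after the head.
    let mut nodes = vec![Node { prev: 0, next: 0 }, Node { prev: 1, next: 1 }];
    link_after(&mut nodes, 0, 1);
    assert_eq!(nodes[0].next, 1);
    unlink(&mut nodes, 1);
    assert_eq!(nodes[0].next, 0);
}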
@@ -10,6 +10,7 @@
|
||||
//
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Buf;
|
||||
use bytes::Bytes;
|
||||
use futures::{Stream, StreamExt};
|
||||
use pageserver_api::models::{
|
||||
@@ -18,22 +19,23 @@ use pageserver_api::models::{
|
||||
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
|
||||
PagestreamNblocksRequest, PagestreamNblocksResponse,
|
||||
};
|
||||
|
||||
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
|
||||
use std::io;
|
||||
use std::net::TcpListener;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use tokio::pin;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::io::SyncIoBridge;
|
||||
use tracing::*;
|
||||
use utils::id::ConnectionId;
|
||||
use utils::{
|
||||
auth::{self, Claims, JwtAuth, Scope},
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
postgres_backend_async::{self, PostgresBackend},
|
||||
pq_proto::{BeMessage, FeMessage, RowDescriptor},
|
||||
simple_rcu::RcuReadGuard,
|
||||
};
|
||||
|
||||
@@ -41,12 +43,12 @@ use crate::basebackup;
|
||||
use crate::config::{PageServerConf, ProfilingConfig};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
|
||||
use crate::page_image_cache;
|
||||
use crate::profiling::profpoint_start;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant_mgr;
|
||||
use crate::trace::Tracer;
|
||||
use crate::CheckpointConfig;
|
||||
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
@@ -74,6 +76,12 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
|
||||
FeMessage::CopyData(bytes) => bytes,
|
||||
FeMessage::CopyDone => { break },
|
||||
FeMessage::Sync => continue,
|
||||
FeMessage::Terminate => {
|
||||
let msg = format!("client terminated connection with Terminate message during COPY");
|
||||
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
break;
|
||||
}
|
||||
m => {
|
||||
let msg = format!("unexpected message {:?}", m);
|
||||
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
|
||||
@@ -85,10 +93,10 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
|
||||
yield copy_data_bytes;
|
||||
}
|
||||
Ok(None) => {
|
||||
let msg = "client closed connection";
|
||||
let msg = "client closed connection during COPY";
|
||||
pgb.write_message(&BeMessage::ErrorResponse(msg))?;
|
||||
pgb.flush().await?;
|
||||
Err(io::Error::new(io::ErrorKind::Other, msg))?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
}
|
||||
Err(e) => {
|
||||
Err(io::Error::new(io::ErrorKind::Other, e))?;
|
||||
@@ -269,6 +277,18 @@ impl PageServerHandler {
|
||||
// so there is no need to reset the association
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
|
||||
// Make request tracer if needed
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
|
||||
let mut tracer = if tenant.get_trace_read_requests() {
|
||||
let connection_id = ConnectionId::generate();
|
||||
let path = tenant
|
||||
.conf
|
||||
.trace_path(&tenant_id, &timeline_id, &connection_id);
|
||||
Some(Tracer::new(path))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Check that the timeline exists
|
||||
let timeline = get_local_timeline(tenant_id, timeline_id)?;
|
||||
|
||||
@@ -301,7 +321,12 @@ impl PageServerHandler {
|
||||
|
||||
trace!("query: {copy_data_bytes:?}");
|
||||
|
||||
let neon_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
|
||||
// Trace request if needed
|
||||
if let Some(t) = tracer.as_mut() {
|
||||
t.trace(©_data_bytes)
|
||||
}
|
||||
|
||||
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
|
||||
|
||||
let response = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
@@ -368,14 +393,12 @@ impl PageServerHandler {
|
||||
pgb.write_message(&BeMessage::CopyInResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
// import_basebackup_from_tar() is not async, mainly because the Tar crate
|
||||
// it uses is not async. So we need to jump through some hoops:
|
||||
// - convert the input from client connection to a synchronous Read
|
||||
// - use block_in_place()
|
||||
let mut copyin_stream = Box::pin(copyin_stream(pgb));
|
||||
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
|
||||
tokio::task::block_in_place(|| timeline.import_basebackup_from_tar(reader, base_lsn))?;
|
||||
timeline.initialize()?;
|
||||
let copyin_stream = copyin_stream(pgb);
|
||||
pin!(copyin_stream);
|
||||
|
||||
timeline
|
||||
.import_basebackup_from_tar(&mut copyin_stream, base_lsn)
|
||||
.await?;
|
||||
|
||||
// Drain the rest of the Copy data
|
||||
let mut bytes_after_tar = 0;
|
||||
@@ -440,7 +463,7 @@ impl PageServerHandler {
|
||||
// We only want to persist the data, and it doesn't matter if it's in the
|
||||
// shape of deltas or images.
|
||||
info!("flushing layers");
|
||||
timeline.checkpoint(CheckpointConfig::Flush)?;
|
||||
timeline.checkpoint(CheckpointConfig::Flush).await?;
|
||||
|
||||
info!("done");
|
||||
Ok(())
|
||||
@@ -582,12 +605,8 @@ impl PageServerHandler {
|
||||
// current profiling is based on a thread-local variable, so it doesn't work
|
||||
// across awaits
|
||||
let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
|
||||
let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?;
|
||||
|
||||
let page = if req.latest {
|
||||
page_image_cache::lookup(timeline, req.rel, req.blkno, lsn)
|
||||
} else {
|
||||
timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, false)
|
||||
}?;
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
}))
|
||||
|
||||
@@ -1179,7 +1179,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
|
||||
fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
|
||||
@@ -12,8 +12,12 @@
|
||||
//!
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use bytes::Bytes;
|
||||
use futures::Stream;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use tokio::sync::watch;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::io::SyncIoBridge;
|
||||
use tracing::*;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
|
||||
@@ -29,6 +33,7 @@ use std::io::Write;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
use std::pin::Pin;
|
||||
use std::process::Command;
|
||||
use std::process::Stdio;
|
||||
use std::sync::Arc;
|
||||
@@ -72,6 +77,8 @@ pub mod storage_layer;
|
||||
|
||||
mod timeline;
|
||||
|
||||
pub mod size;
|
||||
|
||||
use storage_layer::Layer;
|
||||
|
||||
pub use timeline::Timeline;
|
||||
@@ -120,6 +127,9 @@ pub struct Tenant {
|
||||
|
||||
/// Makes every timeline to backup their files to remote storage.
|
||||
upload_layers: bool,
|
||||
|
||||
/// Cached logical sizes, updated on each [`Tenant::gather_size_inputs`].
|
||||
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
|
||||
}
|
||||
|
||||
/// A timeline with some of its files on disk, being initialized.
|
||||
@@ -132,7 +142,7 @@ pub struct Tenant {
|
||||
pub struct UninitializedTimeline<'t> {
|
||||
owning_tenant: &'t Tenant,
|
||||
timeline_id: TimelineId,
|
||||
raw_timeline: Option<(Timeline, TimelineUninitMark)>,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
|
||||
}
|
||||
|
||||
/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
|
||||
@@ -164,7 +174,6 @@ impl UninitializedTimeline<'_> {
|
||||
let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
|
||||
format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
let new_timeline = Arc::new(new_timeline);
|
||||
|
||||
let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
|
||||
// TODO it would be good to ensure that, but apparently a lot of our testing is dependent on that at least
|
||||
@@ -192,6 +201,9 @@ impl UninitializedTimeline<'_> {
|
||||
})?;
|
||||
new_timeline.set_state(TimelineState::Active);
|
||||
v.insert(Arc::clone(&new_timeline));
|
||||
|
||||
new_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
new_timeline.launch_wal_receiver();
|
||||
}
|
||||
}
|
||||
@@ -200,20 +212,28 @@ impl UninitializedTimeline<'_> {
|
||||
}
|
||||
|
||||
/// Prepares timeline data by loading it from the basebackup archive.
|
||||
pub fn import_basebackup_from_tar(
|
||||
&self,
|
||||
reader: impl std::io::Read,
|
||||
pub async fn import_basebackup_from_tar(
|
||||
self,
|
||||
mut copyin_stream: &mut Pin<&mut impl Stream<Item = io::Result<Bytes>>>,
|
||||
base_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let raw_timeline = self.raw_timeline()?;
|
||||
import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn).with_context(
|
||||
|| {
|
||||
format!(
|
||||
"Failed to import basebackup for timeline {}/{}",
|
||||
self.owning_tenant.tenant_id, self.timeline_id
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
// import_basebackup_from_tar() is not async, mainly because the Tar crate
|
||||
// it uses is not async. So we need to jump through some hoops:
|
||||
// - convert the input from client connection to a synchronous Read
|
||||
// - use block_in_place()
|
||||
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
|
||||
|
||||
tokio::task::block_in_place(|| {
|
||||
import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn)
|
||||
.context("Failed to import basebackup")
|
||||
})?;
|
||||
|
||||
// Flush loop needs to be spawned in order for checkpoint to be able to flush.
|
||||
// We want to run proper checkpoint before we mark timeline as available to outside world
|
||||
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
|
||||
raw_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
bail!("failpoint before-checkpoint-new-timeline");
|
||||
@@ -221,16 +241,15 @@ impl UninitializedTimeline<'_> {
|
||||
|
||||
raw_timeline
|
||||
.checkpoint(CheckpointConfig::Flush)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to checkpoint after basebackup import for timeline {}/{}",
|
||||
self.owning_tenant.tenant_id, self.timeline_id
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
.await
|
||||
.context("Failed to checkpoint after basebackup import")?;
|
||||
|
||||
let timeline = self.initialize()?;
|
||||
|
||||
Ok(timeline)
|
||||
}
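For context on the bridging done above: `StreamReader` turns a stream of `Bytes` into an `AsyncRead`, `SyncIoBridge` exposes that as a blocking `std::io::Read`, and `block_in_place` lets the blocking tar import run without stalling the executor. A minimal standalone sketch of the same pattern, not part of this commit, assuming only `tokio`, `tokio-util`, `bytes` and `futures`:

```rust
use bytes::Bytes;
use futures::stream;
use std::io::Read;
use tokio_util::io::{StreamReader, SyncIoBridge};

// block_in_place requires the multi-threaded runtime.
#[tokio::main(flavor = "multi_thread")]
async fn main() -> std::io::Result<()> {
    // An async stream of Bytes chunks, standing in for the COPY-in data.
    let chunks = stream::iter(vec![
        Ok::<_, std::io::Error>(Bytes::from_static(b"hello ")),
        Ok(Bytes::from_static(b"world")),
    ]);

    // Stream<Item = io::Result<Bytes>> -> AsyncRead -> blocking Read.
    let mut reader = SyncIoBridge::new(StreamReader::new(chunks));

    // Run the synchronous read without blocking the async executor.
    let text = tokio::task::block_in_place(move || {
        let mut buf = String::new();
        reader.read_to_string(&mut buf)?;
        Ok::<_, std::io::Error>(buf)
    })?;

    assert_eq!(text, "hello world");
    Ok(())
}
```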
|
||||
|
||||
fn raw_timeline(&self) -> anyhow::Result<&Timeline> {
|
||||
fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
|
||||
Ok(&self
|
||||
.raw_timeline
|
||||
.as_ref()
|
||||
@@ -442,14 +461,7 @@ impl Tenant {
|
||||
.context("Cannot branch off the timeline that's not present in pageserver")?;
|
||||
|
||||
if let Some(lsn) = ancestor_start_lsn.as_mut() {
|
||||
// Wait for the WAL to arrive and be processed on the parent branch up
|
||||
// to the requested branch point. The repository code itself doesn't
|
||||
// require it, but if we start to receive WAL on the new timeline,
|
||||
// decoding the new WAL might need to look up previous pages, relation
|
||||
// sizes etc. and that would get confused if the previous page versions
|
||||
// are not in the repository yet.
|
||||
*lsn = lsn.align();
|
||||
ancestor_timeline.wait_lsn(*lsn).await?;
|
||||
|
||||
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
|
||||
if ancestor_ancestor_lsn > *lsn {
|
||||
@@ -461,11 +473,19 @@ impl Tenant {
|
||||
ancestor_ancestor_lsn,
|
||||
);
|
||||
}
|
||||
|
||||
// Wait for the WAL to arrive and be processed on the parent branch up
|
||||
// to the requested branch point. The repository code itself doesn't
|
||||
// require it, but if we start to receive WAL on the new timeline,
|
||||
// decoding the new WAL might need to look up previous pages, relation
|
||||
// sizes etc. and that would get confused if the previous page versions
|
||||
// are not in the repository yet.
|
||||
ancestor_timeline.wait_lsn(*lsn).await?;
|
||||
}
|
||||
|
||||
self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?
|
||||
}
|
||||
None => self.bootstrap_timeline(new_timeline_id, pg_version)?,
|
||||
None => self.bootstrap_timeline(new_timeline_id, pg_version).await?,
|
||||
};
|
||||
|
||||
// Have added new timeline into the tenant, now its background tasks are needed.
|
||||
@@ -483,7 +503,7 @@ impl Tenant {
|
||||
/// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
|
||||
/// to make tests more deterministic.
|
||||
/// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
|
||||
pub fn gc_iteration(
|
||||
pub async fn gc_iteration(
|
||||
&self,
|
||||
target_timeline_id: Option<TimelineId>,
|
||||
horizon: u64,
|
||||
@@ -499,11 +519,13 @@ impl Tenant {
|
||||
.map(|x| x.to_string())
|
||||
.unwrap_or_else(|| "-".to_string());
|
||||
|
||||
STORAGE_TIME
|
||||
.with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
|
||||
.observe_closure_duration(|| {
|
||||
self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
|
||||
})
|
||||
{
|
||||
let _timer = STORAGE_TIME
|
||||
.with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
|
||||
.start_timer();
|
||||
self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
|
||||
.await
|
||||
}
|
||||
}
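The timer change above works because a `prometheus` `HistogramTimer` is a guard that records the elapsed time when it is dropped, so it can live across `.await` points, whereas `observe_closure_duration` would need the whole body inside a synchronous closure. A rough sketch of the guard pattern; the metric and function names are illustrative, not taken from the codebase:

```rust
use once_cell::sync::Lazy;
use prometheus::{register_histogram_vec, HistogramVec};

static OP_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!("op_seconds", "time spent per operation", &["op"]).unwrap()
});

async fn timed_gc() {
    // The timer observes its duration on drop, await points included.
    let _timer = OP_SECONDS.with_label_values(&["gc"]).start_timer();
    tokio::time::sleep(std::time::Duration::from_millis(10)).await;
    // _timer drops here and records the full elapsed time.
}
```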
|
||||
|
||||
/// Perform one compaction iteration.
|
||||
@@ -523,7 +545,6 @@ impl Tenant {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timelines_to_compact = timelines
|
||||
.iter()
|
||||
.filter(|(_, timeline)| timeline.is_active())
|
||||
.map(|(timeline_id, timeline)| (*timeline_id, timeline.clone()))
|
||||
.collect::<Vec<_>>();
|
||||
drop(timelines);
|
||||
@@ -540,23 +561,24 @@ impl Tenant {
|
||||
///
|
||||
/// Used at graceful shutdown.
|
||||
///
|
||||
pub fn checkpoint(&self) -> anyhow::Result<()> {
|
||||
pub async fn checkpoint(&self) -> anyhow::Result<()> {
|
||||
// Scan through the hashmap and collect a list of all the timelines,
|
||||
// while holding the lock. Then drop the lock and actually perform the
|
||||
// checkpoints. We don't want to block everything else while the
|
||||
// checkpoint runs.
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timelines_to_checkpoint = timelines
|
||||
.iter()
|
||||
.map(|(timeline_id, timeline)| (*timeline_id, Arc::clone(timeline)))
|
||||
.collect::<Vec<_>>();
|
||||
drop(timelines);
|
||||
let timelines_to_checkpoint = {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
timelines
|
||||
.iter()
|
||||
.map(|(id, timeline)| (*id, Arc::clone(timeline)))
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
for (timeline_id, timeline) in &timelines_to_checkpoint {
|
||||
let _entered =
|
||||
info_span!("checkpoint", timeline = %timeline_id, tenant = %self.tenant_id)
|
||||
.entered();
|
||||
timeline.checkpoint(CheckpointConfig::Flush)?;
|
||||
for (id, timeline) in &timelines_to_checkpoint {
|
||||
timeline
|
||||
.checkpoint(CheckpointConfig::Flush)
|
||||
.instrument(info_span!("checkpoint", timeline = %id, tenant = %self.tenant_id))
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -785,6 +807,13 @@ impl Tenant {
|
||||
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
|
||||
}
|
||||
|
||||
pub fn get_trace_read_requests(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap();
|
||||
tenant_conf
|
||||
.trace_read_requests
|
||||
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
|
||||
}
|
||||
|
||||
pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
|
||||
self.tenant_conf.write().unwrap().update(&new_tenant_conf);
|
||||
}
|
||||
@@ -835,6 +864,7 @@ impl Tenant {
|
||||
remote_index,
|
||||
upload_layers,
|
||||
state,
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -956,8 +986,9 @@ impl Tenant {
|
||||
// +-----baz-------->
|
||||
//
|
||||
//
|
||||
// 1. Grab 'gc_cs' mutex to prevent new timelines from being created
|
||||
// 2. Scan all timelines, and on each timeline, make note of the
|
||||
// 1. Grab 'gc_cs' mutex to prevent new timelines from being created while Timeline's
|
||||
// `gc_infos` are being refreshed
|
||||
// 2. Scan collected timelines, and on each timeline, make note of the
|
||||
// all the points where other timelines have been branched off.
|
||||
// We will refrain from removing page versions at those LSNs.
|
||||
// 3. For each timeline, scan all layer files on the timeline.
|
||||
@@ -968,7 +999,7 @@ impl Tenant {
|
||||
// - if a relation has a non-incremental persistent layer on a child branch, then we
|
||||
// don't need to keep that in the parent anymore. But currently
|
||||
// we do.
|
||||
fn gc_iteration_internal(
|
||||
async fn gc_iteration_internal(
|
||||
&self,
|
||||
target_timeline_id: Option<TimelineId>,
|
||||
horizon: u64,
|
||||
@@ -978,6 +1009,72 @@ impl Tenant {
|
||||
let mut totals: GcResult = Default::default();
|
||||
let now = Instant::now();
|
||||
|
||||
let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;
|
||||
|
||||
utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
|
||||
|
||||
info!("starting on {} timelines", gc_timelines.len());
|
||||
|
||||
// Perform GC for each timeline.
|
||||
//
|
||||
// Note that we don't hold the GC lock here because we don't want
|
||||
// to delay the branch creation task, which requires the GC lock.
|
||||
// A timeline GC iteration can be slow because it may need to wait for
|
||||
// compaction (both require `layer_removal_cs` lock),
|
||||
// but the GC iteration can run concurrently with branch creation.
|
||||
//
|
||||
// See comments in [`Tenant::branch_timeline`] for more information
|
||||
// about why branch creation task can run concurrently with timeline's GC iteration.
|
||||
for timeline in gc_timelines {
|
||||
if task_mgr::is_shutdown_requested() {
|
||||
// We were requested to shut down. Stop and return with the progress we
|
||||
// made.
|
||||
break;
|
||||
}
|
||||
|
||||
// If requested, force flush all in-memory layers to disk first,
|
||||
// so that they too can be garbage collected. That's
|
||||
// used in tests, where we want results to be as deterministic as possible.
|
||||
if checkpoint_before_gc {
|
||||
timeline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
info!(
|
||||
"timeline {} checkpoint_before_gc done",
|
||||
timeline.timeline_id
|
||||
);
|
||||
}
|
||||
|
||||
let result = timeline.gc()?;
|
||||
totals += result;
|
||||
}
|
||||
|
||||
totals.elapsed = now.elapsed();
|
||||
Ok(totals)
|
||||
}
|
||||
|
||||
/// Refreshes the Timeline::gc_info for all timelines, returning the
|
||||
/// vector of timelines which have [`Timeline::get_last_record_lsn`] past
|
||||
/// [`Tenant::get_gc_horizon`].
|
||||
///
|
||||
/// This is usually executed as part of periodic gc, but can now be triggered more often.
|
||||
pub fn refresh_gc_info(&self) -> anyhow::Result<Vec<Arc<Timeline>>> {
|
||||
// since this method can now be called at different rates than the configured gc loop, it
|
||||
// might be that these configuration values get applied faster than what it was previously,
|
||||
// since these were only read from the gc task.
|
||||
let horizon = self.get_gc_horizon();
|
||||
let pitr = self.get_pitr_interval();
|
||||
|
||||
// refresh all timelines
|
||||
let target_timeline_id = None;
|
||||
|
||||
self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)
|
||||
}
|
||||
|
||||
fn refresh_gc_info_internal(
|
||||
&self,
|
||||
target_timeline_id: Option<TimelineId>,
|
||||
horizon: u64,
|
||||
pitr: Duration,
|
||||
) -> anyhow::Result<Vec<Arc<Timeline>>> {
|
||||
// grab mutex to prevent new timelines from being created here.
|
||||
let gc_cs = self.gc_cs.lock().unwrap();
|
||||
|
||||
@@ -995,11 +1092,7 @@ impl Tenant {
|
||||
|
||||
timelines
|
||||
.iter()
|
||||
.filter(|(_, timeline)| timeline.is_active())
|
||||
.map(|(timeline_id, timeline_entry)| {
|
||||
// This is unresolved question for now, how to do gc in presence of remote timelines
|
||||
// especially when this is combined with branching.
|
||||
// Somewhat related: https://github.com/neondatabase/neon/issues/999
|
||||
if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
|
||||
// If target_timeline is specified, we only need to know branchpoints of its children
|
||||
if let Some(timeline_id) = target_timeline_id {
|
||||
@@ -1053,41 +1146,7 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
drop(gc_cs);
|
||||
|
||||
// Perform GC for each timeline.
|
||||
//
|
||||
// Note that we don't hold the GC lock here because we don't want
|
||||
// to delay the branch creation task, which requires the GC lock.
|
||||
// A timeline GC iteration can be slow because it may need to wait for
|
||||
// compaction (both require `layer_removal_cs` lock),
|
||||
// but the GC iteration can run concurrently with branch creation.
|
||||
//
|
||||
// See comments in [`Tenant::branch_timeline`] for more information
|
||||
// about why branch creation task can run concurrently with timeline's GC iteration.
|
||||
for timeline in gc_timelines {
|
||||
if task_mgr::is_shutdown_requested() {
|
||||
// We were requested to shut down. Stop and return with the progress we
|
||||
// made.
|
||||
break;
|
||||
}
|
||||
|
||||
// If requested, force flush all in-memory layers to disk first,
|
||||
// so that they too can be garbage collected. That's
|
||||
// used in tests, where we want results to be as deterministic as possible.
|
||||
if checkpoint_before_gc {
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
info!(
|
||||
"timeline {} checkpoint_before_gc done",
|
||||
timeline.timeline_id
|
||||
);
|
||||
}
|
||||
|
||||
let result = timeline.gc()?;
|
||||
totals += result;
|
||||
}
|
||||
|
||||
totals.elapsed = now.elapsed();
|
||||
Ok(totals)
|
||||
Ok(gc_timelines)
|
||||
}
|
||||
|
||||
/// Branch an existing timeline
|
||||
@@ -1191,14 +1250,15 @@ impl Tenant {
|
||||
|
||||
/// - run initdb to init a temporary instance and get bootstrap data
/// - after initialization completes, remove the temp dir.
|
||||
fn bootstrap_timeline(
|
||||
async fn bootstrap_timeline(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timeline_uninit_mark = self.create_timeline_uninit_mark(timeline_id, &timelines)?;
|
||||
drop(timelines);
|
||||
let timeline_uninit_mark = {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
self.create_timeline_uninit_mark(timeline_id, &timelines)?
|
||||
};
|
||||
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
|
||||
// temporary directory for basebackup files for the given timeline.
|
||||
let initdb_path = path_with_suffix_extension(
|
||||
@@ -1248,25 +1308,35 @@ impl Tenant {
|
||||
|
||||
let tenant_id = raw_timeline.owning_tenant.tenant_id;
|
||||
let unfinished_timeline = raw_timeline.raw_timeline()?;
|
||||
import_datadir::import_timeline_from_postgres_datadir(
|
||||
unfinished_timeline,
|
||||
pgdata_path,
|
||||
pgdata_lsn,
|
||||
)
|
||||
|
||||
tokio::task::block_in_place(|| {
|
||||
import_datadir::import_timeline_from_postgres_datadir(
|
||||
unfinished_timeline,
|
||||
pgdata_path,
|
||||
pgdata_lsn,
|
||||
)
|
||||
})
|
||||
.with_context(|| {
|
||||
format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
|
||||
// Flush loop needs to be spawned in order for checkpoint to be able to flush.
|
||||
// We want to run proper checkpoint before we mark timeline as available to outside world
|
||||
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
|
||||
unfinished_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
anyhow::bail!("failpoint before-checkpoint-new-timeline");
|
||||
});
|
||||
|
||||
unfinished_timeline
|
||||
.checkpoint(CheckpointConfig::Forced)
|
||||
.checkpoint(CheckpointConfig::Forced).await
|
||||
.with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?;
|
||||
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
let timeline = raw_timeline.initialize_with_lock(&mut timelines, false)?;
|
||||
drop(timelines);
|
||||
let timeline = {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
raw_timeline.initialize_with_lock(&mut timelines, false)?
|
||||
};
|
||||
|
||||
info!(
|
||||
"created root timeline {} timeline.lsn {}",
|
||||
@@ -1306,7 +1376,7 @@ impl Tenant {
|
||||
Ok(UninitializedTimeline {
|
||||
owning_tenant: self,
|
||||
timeline_id: new_timeline_id,
|
||||
raw_timeline: Some((new_timeline, uninit_mark)),
|
||||
raw_timeline: Some((Arc::new(new_timeline), uninit_mark)),
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -1425,7 +1495,7 @@ impl Tenant {
|
||||
let timeline = UninitializedTimeline {
|
||||
owning_tenant: self,
|
||||
timeline_id,
|
||||
raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())),
|
||||
raw_timeline: Some((Arc::new(dummy_timeline), TimelineUninitMark::dummy())),
|
||||
};
|
||||
match timeline.initialize_with_lock(&mut timelines_accessor, true) {
|
||||
Ok(initialized_timeline) => {
|
||||
@@ -1446,6 +1516,25 @@ impl Tenant {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gathers inputs from all of the timelines to produce a sizing model input.
|
||||
///
|
||||
/// Future is cancellation safe. Only one calculation can be running at once per tenant.
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
|
||||
pub async fn gather_size_inputs(&self) -> anyhow::Result<size::ModelInputs> {
|
||||
let logical_sizes_at_once = self
|
||||
.conf
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.inner();
|
||||
|
||||
// TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries
|
||||
// are for testing/experimenting, we tolerate this.
|
||||
//
|
||||
// See more on issue #2748, condensed out of the initial PR review.
|
||||
let mut shared_cache = self.cached_logical_sizes.lock().await;
|
||||
|
||||
size::gather_inputs(self, logical_sizes_at_once, &mut *shared_cache).await
|
||||
}
|
||||
}
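The single `tokio::sync::Mutex` noted above serializes size calculations because an async mutex guard can be held across `.await`: later callers simply queue on `lock()`. A stripped-down sketch of that cache-plus-serialization idea; types and names here are illustrative, not the pageserver's:

```rust
use std::collections::HashMap;
use tokio::sync::Mutex;

struct SizeCache {
    // Holding this guard across .await makes callers take turns.
    cached: Mutex<HashMap<u64, u64>>,
}

impl SizeCache {
    async fn size_at(&self, lsn: u64) -> u64 {
        let mut cache = self.cached.lock().await; // later callers wait here
        if let Some(size) = cache.get(&lsn) {
            return *size;
        }
        let size = expensive_size_calculation(lsn).await;
        cache.insert(lsn, size);
        size
    }
}

async fn expensive_size_calculation(lsn: u64) -> u64 {
    tokio::time::sleep(std::time::Duration::from_millis(5)).await;
    lsn * 2 // placeholder computation
}
```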
|
||||
|
||||
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
|
||||
@@ -1589,6 +1678,7 @@ pub mod harness {
|
||||
walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
|
||||
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
|
||||
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
|
||||
trace_read_requests: Some(tenant_conf.trace_read_requests),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1860,7 +1950,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
|
||||
async fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
|
||||
let mut lsn = start_lsn;
|
||||
#[allow(non_snake_case)]
|
||||
{
|
||||
@@ -1881,7 +1971,7 @@ mod tests {
|
||||
writer.finish_write(lsn);
|
||||
lsn += 0x10;
|
||||
}
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
{
|
||||
let writer = tline.writer();
|
||||
writer.put(
|
||||
@@ -1898,24 +1988,26 @@ mod tests {
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
}
|
||||
tline.checkpoint(CheckpointConfig::Forced)
|
||||
tline.checkpoint(CheckpointConfig::Forced).await
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let tenant =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
|
||||
.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
|
||||
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
// FIXME: this doesn't actually remove any layer currently, given how the checkpointing
|
||||
// and compaction works. But it does set the 'cutoff' point so that the cross check
|
||||
// below should fail.
|
||||
tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
tenant
|
||||
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
|
||||
.await?;
|
||||
|
||||
// try to branch at lsn 25, should fail because we already garbage collected the data
|
||||
match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
|
||||
@@ -1960,14 +2052,14 @@ mod tests {
|
||||
/*
|
||||
// FIXME: This currently fails to error out. Calling GC doesn't currently
|
||||
// remove the old value, we'd need to work a little harder
|
||||
#[test]
|
||||
fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
|
||||
.load();
|
||||
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
|
||||
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
|
||||
@@ -1980,43 +2072,47 @@ mod tests {
|
||||
}
|
||||
*/
|
||||
|
||||
#[test]
|
||||
fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
|
||||
let tenant =
|
||||
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
|
||||
|
||||
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
let newtline = tenant
|
||||
.get_timeline(NEW_TIMELINE_ID, true)
|
||||
.expect("Should have a local timeline");
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
tenant
|
||||
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
|
||||
.await?;
|
||||
assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
#[test]
|
||||
fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
|
||||
let tenant =
|
||||
TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
|
||||
|
||||
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
let newtline = tenant
|
||||
.get_timeline(NEW_TIMELINE_ID, true)
|
||||
.expect("Should have a local timeline");
|
||||
|
||||
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
|
||||
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
|
||||
|
||||
// run gc on parent
|
||||
tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
tenant
|
||||
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
|
||||
.await?;
|
||||
|
||||
// Check that the data is still accessible on the branch.
|
||||
assert_eq!(
|
||||
@@ -2027,8 +2123,8 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn timeline_load() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn timeline_load() -> anyhow::Result<()> {
|
||||
const TEST_NAME: &str = "timeline_load";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
{
|
||||
@@ -2036,8 +2132,8 @@ mod tests {
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x8000))?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
}
|
||||
|
||||
let tenant = harness.load();
|
||||
@@ -2048,8 +2144,8 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn timeline_load_with_ancestor() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
|
||||
const TEST_NAME: &str = "timeline_load_with_ancestor";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
// create two timelines
|
||||
@@ -2059,8 +2155,8 @@ mod tests {
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
|
||||
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
|
||||
@@ -2068,8 +2164,8 @@ mod tests {
|
||||
.get_timeline(NEW_TIMELINE_ID, true)
|
||||
.expect("Should have a local timeline");
|
||||
|
||||
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
}
|
||||
|
||||
// check that both of them are initially unloaded
|
||||
@@ -2129,8 +2225,8 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_images() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn test_images() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_images")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -2141,7 +2237,7 @@ mod tests {
|
||||
writer.finish_write(Lsn(0x10));
|
||||
drop(writer);
|
||||
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.compact()?;
|
||||
|
||||
let writer = tline.writer();
|
||||
@@ -2149,7 +2245,7 @@ mod tests {
|
||||
writer.finish_write(Lsn(0x20));
|
||||
drop(writer);
|
||||
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.compact()?;
|
||||
|
||||
let writer = tline.writer();
|
||||
@@ -2157,7 +2253,7 @@ mod tests {
|
||||
writer.finish_write(Lsn(0x30));
|
||||
drop(writer);
|
||||
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.compact()?;
|
||||
|
||||
let writer = tline.writer();
|
||||
@@ -2165,7 +2261,7 @@ mod tests {
|
||||
writer.finish_write(Lsn(0x40));
|
||||
drop(writer);
|
||||
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.compact()?;
|
||||
|
||||
assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
|
||||
@@ -2181,8 +2277,8 @@ mod tests {
|
||||
// Insert 1000 key-value pairs with increasing keys, checkpoint,
|
||||
// repeat 50 times.
|
||||
//
|
||||
#[test]
|
||||
fn test_bulk_insert() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn test_bulk_insert() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_bulk_insert")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -2215,7 +2311,7 @@ mod tests {
|
||||
let cutoff = tline.get_last_record_lsn();
|
||||
|
||||
tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.compact()?;
|
||||
tline.gc()?;
|
||||
}
|
||||
@@ -2223,8 +2319,8 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_random_updates() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn test_random_updates() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_random_updates")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -2287,7 +2383,7 @@ mod tests {
|
||||
println!("checkpointing {}", lsn);
|
||||
let cutoff = tline.get_last_record_lsn();
|
||||
tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.compact()?;
|
||||
tline.gc()?;
|
||||
}
|
||||
@@ -2295,8 +2391,8 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_traverse_branches() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn test_traverse_branches() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_traverse_branches")?.load();
|
||||
let mut tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -2368,7 +2464,7 @@ mod tests {
|
||||
println!("checkpointing {}", lsn);
|
||||
let cutoff = tline.get_last_record_lsn();
|
||||
tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.compact()?;
|
||||
tline.gc()?;
|
||||
}
|
||||
|
||||
@@ -74,6 +74,7 @@ where
|
||||
};
|
||||
|
||||
dstbuf.clear();
|
||||
dstbuf.reserve(len);
|
||||
|
||||
// Read the payload
|
||||
let mut remain = len;
|
||||
|
||||
@@ -260,8 +260,9 @@ impl Layer for DeltaLayer {
|
||||
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let mut cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
let buf = cursor.read_blob(pos).with_context(|| {
|
||||
cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
@@ -610,9 +611,9 @@ impl DeltaLayer {
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
pub struct DeltaLayerWriter {
|
||||
struct DeltaLayerWriterInner {
|
||||
conf: &'static PageServerConf,
|
||||
path: PathBuf,
|
||||
pub path: PathBuf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
|
||||
@@ -624,17 +625,17 @@ pub struct DeltaLayerWriter {
|
||||
blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
|
||||
}
|
||||
|
||||
impl DeltaLayerWriter {
|
||||
impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
pub fn new(
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
) -> Result<DeltaLayerWriter> {
|
||||
) -> anyhow::Result<Self> {
|
||||
// Create the file initially with a temporary filename. We don't know
|
||||
// the end key yet, so we cannot form the final filename yet. We will
|
||||
// rename it when we're done.
|
||||
@@ -653,7 +654,7 @@ impl DeltaLayerWriter {
|
||||
let block_buf = BlockBuf::new();
|
||||
let tree_builder = DiskBtreeBuilder::new(block_buf);
|
||||
|
||||
Ok(DeltaLayerWriter {
|
||||
Ok(Self {
|
||||
conf,
|
||||
path,
|
||||
timeline_id,
|
||||
@@ -670,17 +671,17 @@ impl DeltaLayerWriter {
|
||||
///
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
|
||||
fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
|
||||
}
|
||||
|
||||
pub fn put_value_bytes(
|
||||
fn put_value_bytes(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: &[u8],
|
||||
will_init: bool,
|
||||
) -> Result<()> {
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(self.lsn_range.start <= lsn);
|
||||
|
||||
let off = self.blob_writer.write_blob(val)?;
|
||||
@@ -693,14 +694,14 @@ impl DeltaLayerWriter {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
fn size(&self) -> u64 {
|
||||
self.blob_writer.size() + self.tree.borrow_writer().size()
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
pub fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -768,6 +769,102 @@ impl DeltaLayerWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new delta layer.
|
||||
///
|
||||
/// Usage:
|
||||
///
|
||||
/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
|
||||
///
|
||||
/// 2. Write the contents by calling `put_value` for every page
|
||||
/// version to store in the layer.
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
|
||||
/// possible for the writer to drop before `finish` is actually called. So this
|
||||
/// could lead to odd temporary files in the directory, exhausting the file system.
/// This structure wraps `DeltaLayerWriterInner` and also contains a `Drop`
/// implementation that cleans up the temporary file on failure. It's not
|
||||
/// possible to do this directly in `DeltaLayerWriterInner` since `finish` moves
|
||||
/// out some fields, making it impossible to implement `Drop`.
|
||||
///
|
||||
#[must_use]
|
||||
pub struct DeltaLayerWriter {
|
||||
inner: Option<DeltaLayerWriterInner>,
|
||||
}
|
||||
|
||||
impl DeltaLayerWriter {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
inner: Some(DeltaLayerWriterInner::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_start,
|
||||
lsn_range,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Append a key-value pair to the file.
|
||||
///
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_value(key, lsn, val)
|
||||
}
|
||||
|
||||
pub fn put_value_bytes(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: &[u8],
|
||||
will_init: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
self.inner
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.put_value_bytes(key, lsn, val, will_init)
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
self.inner.as_ref().unwrap().size()
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
self.inner.take().unwrap().finish(key_end)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for DeltaLayerWriter {
|
||||
fn drop(&mut self) {
|
||||
if let Some(inner) = self.inner.take() {
|
||||
match inner.blob_writer.into_inner().into_inner() {
|
||||
Ok(vfile) => vfile.remove(),
|
||||
Err(err) => warn!(
|
||||
"error while flushing buffer of image layer temporary file: {}",
|
||||
err
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
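The `Option<DeltaLayerWriterInner>` wrapper above is the standard way to combine a consuming `finish(self)` with a `Drop` that cleans up abandoned temporary files: `finish` takes the inner value out, so `Drop` only acts when the writer was never finished. A generic sketch of the pattern with made-up names, independent of the pageserver types:

```rust
use std::fs::File;
use std::io;
use std::path::PathBuf;

struct TempFileWriter {
    // `finish` moves the inner value out; Drop then sees None and does nothing.
    inner: Option<File>,
    path: PathBuf,
}

impl TempFileWriter {
    fn finish(mut self) -> io::Result<PathBuf> {
        let file = self.inner.take().expect("finish called only once");
        file.sync_all()?;
        Ok(self.path.clone())
    }
}

impl Drop for TempFileWriter {
    fn drop(&mut self) {
        if self.inner.take().is_some() {
            // Never finished: remove the temporary file instead of leaking it.
            let _ = std::fs::remove_file(&self.path);
        }
    }
}
```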
|
||||
|
||||
///
|
||||
/// Iterator over all key-value pairs stored in a delta layer
|
||||
///
|
||||
|
||||
@@ -411,7 +411,7 @@ impl ImageLayer {
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
pub struct ImageLayerWriter {
|
||||
struct ImageLayerWriterInner {
|
||||
conf: &'static PageServerConf,
|
||||
path: PathBuf,
|
||||
timeline_id: TimelineId,
|
||||
@@ -423,14 +423,17 @@ pub struct ImageLayerWriter {
|
||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||
}
|
||||
|
||||
impl ImageLayerWriter {
|
||||
pub fn new(
|
||||
impl ImageLayerWriterInner {
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
) -> anyhow::Result<Self> {
|
||||
// Create the file initially with a temporary filename.
|
||||
// We'll atomically rename it to the final name when we're done.
|
||||
let path = ImageLayer::temp_path_for(
|
||||
@@ -455,7 +458,7 @@ impl ImageLayerWriter {
|
||||
let block_buf = BlockBuf::new();
|
||||
let tree_builder = DiskBtreeBuilder::new(block_buf);
|
||||
|
||||
let writer = ImageLayerWriter {
|
||||
let writer = Self {
|
||||
conf,
|
||||
path,
|
||||
timeline_id,
|
||||
@@ -474,7 +477,7 @@ impl ImageLayerWriter {
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
|
||||
fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
ensure!(self.key_range.contains(&key));
|
||||
let off = self.blob_writer.write_blob(img)?;
|
||||
|
||||
@@ -485,7 +488,10 @@ impl ImageLayerWriter {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(self) -> anyhow::Result<ImageLayer> {
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
fn finish(self) -> anyhow::Result<ImageLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -552,3 +558,76 @@ impl ImageLayerWriter {
|
||||
Ok(layer)
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new image layer.
|
||||
///
|
||||
/// Usage:
|
||||
///
|
||||
/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
|
||||
///
|
||||
/// 2. Write the contents by calling `put_page_image` for every key-value
|
||||
/// pair in the key range.
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
|
||||
/// possible for the writer to drop before `finish` is actually called. So this
|
||||
/// could lead to odd temporary files in the directory, exhausting the file system.
/// This structure wraps `ImageLayerWriterInner` and also contains a `Drop`
/// implementation that cleans up the temporary file on failure. It's not
|
||||
/// possible to do this directly in `ImageLayerWriterInner` since `finish` moves
|
||||
/// out some fields, making it impossible to implement `Drop`.
|
||||
///
|
||||
#[must_use]
|
||||
pub struct ImageLayerWriter {
|
||||
inner: Option<ImageLayerWriterInner>,
|
||||
}
|
||||
|
||||
impl ImageLayerWriter {
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
Ok(Self {
|
||||
inner: Some(ImageLayerWriterInner::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_range,
|
||||
lsn,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Write next value to the file.
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_image(key, img)
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
|
||||
self.inner.take().unwrap().finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ImageLayerWriter {
|
||||
fn drop(&mut self) {
|
||||
if let Some(inner) = self.inner.take() {
|
||||
inner.blob_writer.into_inner().remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
461
pageserver/src/tenant/size.rs
Normal file
@@ -0,0 +1,461 @@
|
||||
use std::cmp;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
use super::Tenant;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use tracing::*;
|
||||
|
||||
/// Inputs to the actual tenant sizing model
|
||||
///
|
||||
/// Implements [`serde::Serialize`] but is not meant to be part of the public API, instead meant to
|
||||
/// be a transferable format between execution environments and developers.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(Debug, serde::Serialize, serde::Deserialize)]
|
||||
pub struct ModelInputs {
|
||||
updates: Vec<Update>,
|
||||
retention_period: u64,
|
||||
#[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
|
||||
timeline_inputs: HashMap<TimelineId, TimelineInputs>,
|
||||
}
|
||||
|
||||
/// Collects all LSNs relevant to the inputs. These will only be helpful in the serialized form as
|
||||
/// part of [`ModelInputs`] from the HTTP api, explaining the inputs.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(Debug, serde::Serialize, serde::Deserialize)]
|
||||
struct TimelineInputs {
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
last_record: Lsn,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
latest_gc_cutoff: Lsn,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
horizon_cutoff: Lsn,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pitr_cutoff: Lsn,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
next_gc_cutoff: Lsn,
|
||||
}
|
||||
|
||||
/// Gathers the inputs for the tenant sizing model.
|
||||
///
|
||||
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
|
||||
/// is updated on-demand, during the start of this calculation and separate from the
|
||||
/// [`Timeline::latest_gc_cutoff`].
|
||||
///
|
||||
/// For timelines in general:
|
||||
///
|
||||
/// ```ignore
|
||||
/// 0-----|---------|----|------------| · · · · · |·> lsn
|
||||
/// initdb_lsn branchpoints* next_gc_cutoff latest
|
||||
/// ```
|
||||
///
|
||||
/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
|
||||
/// tenant size will be zero.
|
||||
pub(super) async fn gather_inputs(
|
||||
tenant: &Tenant,
|
||||
limit: &Arc<Semaphore>,
|
||||
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
||||
) -> anyhow::Result<ModelInputs> {
|
||||
// with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
|
||||
// our advantage with `?` error handling.
|
||||
let mut joinset = tokio::task::JoinSet::new();
|
||||
|
||||
let timelines = tenant
|
||||
.refresh_gc_info()
|
||||
.context("Failed to refresh gc_info before gathering inputs")?;
|
||||
|
||||
if timelines.is_empty() {
|
||||
// All timelines are below tenant's gc_horizon; alternative would be to use
|
||||
// Tenant::list_timelines but then those gc_info's would not be updated yet, possibly
|
||||
// missing GcInfo::retain_lsns or having obsolete values for cutoff's.
|
||||
return Ok(ModelInputs {
|
||||
updates: vec![],
|
||||
retention_period: 0,
|
||||
timeline_inputs: HashMap::new(),
|
||||
});
|
||||
}
|
||||
|
||||
// record the used/inserted cache keys here, so that unused extras can be pruned and the cache does not leak;
// after the initial run the cache should be quite stable, but live timelines will eventually
|
||||
// require new lsns to be inspected.
|
||||
let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new();
|
||||
|
||||
let mut updates = Vec::new();
|
||||
|
||||
// record the per-timeline values used to determine `retention_period`
|
||||
let mut timeline_inputs = HashMap::with_capacity(timelines.len());
|
||||
|
||||
// used to determine the `retention_period` for the size model
|
||||
let mut max_cutoff_distance = None;
|
||||
|
||||
// this will probably conflict with on-demand downloaded layers, or at least force them all
|
||||
// to be downloaded
|
||||
for timeline in timelines {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
|
||||
let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = {
|
||||
// there's a race between the update (holding tenant.gc_lock) and this read but it
|
||||
// might not be an issue, because it's not for Timeline::gc
|
||||
let gc_info = timeline.gc_info.read().unwrap();
|
||||
|
||||
// similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a
|
||||
// new gc run, which we have no control over. however differently from `Timeline::gc`
|
||||
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
|
||||
// actually removing files.
|
||||
let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
|
||||
|
||||
// the minimum where we should find the next_gc_cutoff for our calculations.
|
||||
//
|
||||
// next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
|
||||
// want to query any logical size before initdb_lsn.
|
||||
let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn);
|
||||
|
||||
let maybe_cutoff = if next_gc_cutoff > cutoff_minimum {
|
||||
Some((next_gc_cutoff, LsnKind::GcCutOff))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// this assumes there are no other lsns than the branchpoints
|
||||
let lsns = gc_info
|
||||
.retain_lsns
|
||||
.iter()
|
||||
.inspect(|&&lsn| {
|
||||
trace!(
|
||||
timeline_id=%timeline.timeline_id,
|
||||
"retained lsn: {lsn:?}, is_before_ancestor_lsn={}",
|
||||
lsn < timeline.get_ancestor_lsn()
|
||||
)
|
||||
})
|
||||
.filter(|&&lsn| lsn > timeline.get_ancestor_lsn())
|
||||
.copied()
|
||||
.map(|lsn| (lsn, LsnKind::BranchPoint))
|
||||
.chain(maybe_cutoff)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
(
|
||||
lsns,
|
||||
gc_info.horizon_cutoff,
|
||||
gc_info.pitr_cutoff,
|
||||
next_gc_cutoff,
|
||||
)
|
||||
};
|
||||
|
||||
// update this to have a retention_period later for the tenant_size_model
|
||||
// tenant_size_model compares this to the last segments start_lsn
|
||||
if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) {
|
||||
match max_cutoff_distance.as_mut() {
|
||||
Some(max) => {
|
||||
*max = std::cmp::max(*max, cutoff_distance);
|
||||
}
|
||||
_ => {
|
||||
max_cutoff_distance = Some(cutoff_distance);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// all timelines branch from something, because it might be impossible to pinpoint
|
||||
// which is the tenant_size_model's "default" branch.
|
||||
updates.push(Update {
|
||||
lsn: timeline.get_ancestor_lsn(),
|
||||
command: Command::BranchFrom(timeline.get_ancestor_timeline_id()),
|
||||
timeline_id: timeline.timeline_id,
|
||||
});
|
||||
|
||||
for (lsn, _kind) in &interesting_lsns {
|
||||
if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) {
|
||||
updates.push(Update {
|
||||
lsn: *lsn,
|
||||
timeline_id: timeline.timeline_id,
|
||||
command: Command::Update(*size),
|
||||
});
|
||||
|
||||
needed_cache.insert((timeline.timeline_id, *lsn));
|
||||
} else {
|
||||
let timeline = Arc::clone(&timeline);
|
||||
let parallel_size_calcs = Arc::clone(limit);
|
||||
joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn));
|
||||
}
|
||||
}
|
||||
|
||||
// all timelines also have an end point if they have made any progress
|
||||
if last_record_lsn > timeline.get_ancestor_lsn()
|
||||
&& !interesting_lsns
|
||||
.iter()
|
||||
.any(|(lsn, _)| lsn == &last_record_lsn)
|
||||
{
|
||||
updates.push(Update {
|
||||
lsn: last_record_lsn,
|
||||
command: Command::EndOfBranch,
|
||||
timeline_id: timeline.timeline_id,
|
||||
});
|
||||
}
|
||||
|
||||
timeline_inputs.insert(
|
||||
timeline.timeline_id,
|
||||
TimelineInputs {
|
||||
last_record: last_record_lsn,
|
||||
// this is not used above, because it might not have updated recently enough
|
||||
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
|
||||
horizon_cutoff,
|
||||
pitr_cutoff,
|
||||
next_gc_cutoff,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
let mut have_any_error = false;
|
||||
|
||||
while let Some(res) = joinset.join_next().await {
|
||||
// each of these come with Result<Result<_, JoinError>, JoinError>
|
||||
// because of spawn + spawn_blocking
|
||||
let res = res.and_then(|inner| inner);
|
||||
match res {
|
||||
Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => {
|
||||
debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
|
||||
|
||||
logical_size_cache.insert((timeline.timeline_id, lsn), size);
|
||||
needed_cache.insert((timeline.timeline_id, lsn));
|
||||
|
||||
updates.push(Update {
|
||||
lsn,
|
||||
timeline_id: timeline.timeline_id,
|
||||
command: Command::Update(size),
|
||||
});
|
||||
}
|
||||
Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => {
|
||||
warn!(
|
||||
timeline_id=%timeline.timeline_id,
|
||||
"failed to calculate logical size at {lsn}: {error:#}"
|
||||
);
|
||||
have_any_error = true;
|
||||
}
|
||||
Err(join_error) if join_error.is_cancelled() => {
|
||||
unreachable!("we are not cancelling any of the futures, nor should be");
|
||||
}
|
||||
Err(join_error) => {
|
||||
// cannot really do anything, as this panic is likely a bug
|
||||
error!("logical size query panicked: {join_error:#}");
|
||||
have_any_error = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// prune any keys not needed anymore; we record every used key and added key.
|
||||
logical_size_cache.retain(|key, _| needed_cache.contains(key));
|
||||
|
||||
if have_any_error {
|
||||
// we cannot complete this round, because we are missing data.
|
||||
// we have however cached all we were able to request calculation on.
|
||||
anyhow::bail!("failed to calculate some logical_sizes");
|
||||
}
|
||||
|
||||
// the data gathered into `updates` is per-lsn, regardless of the branch, so we can use it to
// our advantage, not requiring a sorted container or graph walk.
|
||||
//
|
||||
// for branch points, which come as multiple updates at the same LSN, the Command::Update
|
||||
// is needed before a branch is made out of that branch Command::BranchFrom. this is
|
||||
// handled by the variant order in `Command`.
|
||||
updates.sort_unstable();
|
||||
|
||||
let retention_period = match max_cutoff_distance {
|
||||
Some(max) => max.0,
|
||||
None => {
|
||||
anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0")
|
||||
}
|
||||
};
|
||||
|
||||
Ok(ModelInputs {
|
||||
updates,
|
||||
retention_period,
|
||||
timeline_inputs,
|
||||
})
|
||||
}
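The result-draining loop above flattens a `Result<Result<_, JoinError>, JoinError>` because each task spawned into the `JoinSet` itself awaits a `spawn_blocking` call. A small standalone sketch of that flattening; the work done per task is a placeholder:

```rust
use tokio::task::{JoinError, JoinSet};

#[tokio::main]
async fn main() {
    let mut set: JoinSet<Result<u64, JoinError>> = JoinSet::new();
    for n in 0..4u64 {
        set.spawn(async move {
            // spawn_blocking yields Result<u64, JoinError>, hence the nesting.
            tokio::task::spawn_blocking(move || n * n).await
        });
    }

    while let Some(res) = set.join_next().await {
        // Collapse the two JoinError layers before matching on the outcome.
        match res.and_then(|inner| inner) {
            Ok(square) => println!("squared: {square}"),
            Err(err) if err.is_cancelled() => unreachable!("nothing is cancelled here"),
            Err(err) => eprintln!("task panicked: {err}"),
        }
    }
}
```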
|
||||
|
||||
impl ModelInputs {
|
||||
pub fn calculate(&self) -> anyhow::Result<u64> {
|
||||
// Option<TimelineId> is used for "naming" the branches because it is assumed to be
|
||||
// impossible to always determine a single main branch.
|
||||
let mut storage = tenant_size_model::Storage::<Option<TimelineId>>::new(None);
|
||||
|
||||
for update in &self.updates {
|
||||
let Update {
|
||||
lsn,
|
||||
command: op,
|
||||
timeline_id,
|
||||
} = update;
|
||||
let Lsn(now) = *lsn;
|
||||
match op {
|
||||
Command::Update(sz) => {
|
||||
storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz));
|
||||
}
|
||||
Command::EndOfBranch => {
|
||||
storage.insert_point(&Some(*timeline_id), "".into(), now, None);
|
||||
}
|
||||
Command::BranchFrom(parent) => {
|
||||
storage.branch(parent, Some(*timeline_id));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(storage.calculate(self.retention_period).total_children())
|
||||
}
|
||||
}
|
||||
|
||||
/// A point of interest in the tree of branches
|
||||
#[serde_with::serde_as]
|
||||
#[derive(
|
||||
Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize,
|
||||
)]
|
||||
struct Update {
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
lsn: utils::lsn::Lsn,
|
||||
command: Command,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
timeline_id: TimelineId,
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
enum Command {
|
||||
Update(u64),
|
||||
BranchFrom(#[serde_as(as = "Option<serde_with::DisplayFromStr>")] Option<TimelineId>),
|
||||
EndOfBranch,
|
||||
}
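The sort in `gather_inputs`/`calculate` leans on the derived orderings: `Update` compares its fields in declaration order (lsn, then command, then timeline_id), and a derived `Ord` on an enum orders by variant declaration before looking at payloads, so an `Update` always sorts before a `BranchFrom` at the same LSN. A tiny illustrative check of the enum part, with shortened stand-in names rather than the real types:

```rust
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Cmd {
    Update(u64),
    BranchFrom(Option<u32>),
    EndOfBranch,
}

fn main() {
    // Variant order dominates the payload: any Update < any BranchFrom.
    assert!(Cmd::Update(u64::MAX) < Cmd::BranchFrom(None));
    assert!(Cmd::BranchFrom(Some(7)) < Cmd::EndOfBranch);
    println!("ordering holds");
}
```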
|
||||
|
||||
impl std::fmt::Debug for Command {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// custom one-line implementation makes it more enjoyable to read {:#?} avoiding 3
|
||||
// linebreaks
|
||||
match self {
|
||||
Self::Update(arg0) => write!(f, "Update({arg0})"),
|
||||
Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"),
|
||||
Self::EndOfBranch => write!(f, "EndOfBranch"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
enum LsnKind {
|
||||
BranchPoint,
|
||||
GcCutOff,
|
||||
}
|
||||
|
||||
/// Newtype around the tuple that carries the result of a timeline-at-LSN logical size calculation.
|
||||
struct TimelineAtLsnSizeResult(
|
||||
Arc<crate::tenant::Timeline>,
|
||||
utils::lsn::Lsn,
|
||||
anyhow::Result<u64>,
|
||||
);
|
||||
|
||||
#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
|
||||
async fn calculate_logical_size(
|
||||
limit: Arc<tokio::sync::Semaphore>,
|
||||
timeline: Arc<crate::tenant::Timeline>,
|
||||
lsn: utils::lsn::Lsn,
|
||||
) -> Result<TimelineAtLsnSizeResult, tokio::task::JoinError> {
|
||||
let permit = tokio::sync::Semaphore::acquire_owned(limit)
|
||||
.await
|
||||
.expect("global semaphore should not had been closed");
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _permit = permit;
|
||||
let size_res = timeline.calculate_logical_size(lsn);
|
||||
TimelineAtLsnSizeResult(timeline, lsn, size_res)
|
||||
})
|
||||
.await
|
||||
}
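`calculate_logical_size` above moves an owned semaphore permit into the blocking task so the concurrency cap covers the entire synchronous computation, not just task startup. A minimal sketch of the same pattern with an illustrative workload:

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

async fn bounded_blocking(limit: Arc<Semaphore>, n: u64) -> anyhow::Result<u64> {
    // An owned permit can be moved into the spawned closure.
    let permit = Arc::clone(&limit).acquire_owned().await?;
    let sum = tokio::task::spawn_blocking(move || {
        let _permit = permit; // released only when the blocking work finishes
        (0..n).sum::<u64>()
    })
    .await?;
    Ok(sum)
}
```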
|
||||
|
||||
#[test]
|
||||
fn updates_sort() {
|
||||
use std::str::FromStr;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
let ids = [
|
||||
TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(),
|
||||
TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(),
|
||||
TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(),
|
||||
];
|
||||
|
||||
// try through all permutations
|
||||
let ids = [
|
||||
[&ids[0], &ids[1], &ids[2]],
|
||||
[&ids[0], &ids[2], &ids[1]],
|
||||
[&ids[1], &ids[0], &ids[2]],
|
||||
[&ids[1], &ids[2], &ids[0]],
|
||||
[&ids[2], &ids[0], &ids[1]],
|
||||
[&ids[2], &ids[1], &ids[0]],
|
||||
];
|
||||
|
||||
for ids in ids {
|
||||
// apply a fixture which uses a permutation of ids
|
||||
let commands = [
|
||||
Update {
|
||||
lsn: Lsn(0),
|
||||
command: Command::BranchFrom(None),
|
||||
timeline_id: *ids[0],
|
||||
},
|
||||
Update {
|
||||
lsn: Lsn::from_str("0/67E7618").unwrap(),
|
||||
command: Command::Update(43696128),
|
||||
timeline_id: *ids[0],
|
||||
},
|
||||
Update {
|
||||
lsn: Lsn::from_str("0/67E7618").unwrap(),
|
||||
command: Command::BranchFrom(Some(*ids[0])),
|
||||
timeline_id: *ids[1],
|
||||
},
|
||||
Update {
|
||||
lsn: Lsn::from_str("0/76BE4F0").unwrap(),
|
||||
command: Command::Update(41844736),
|
||||
timeline_id: *ids[1],
|
||||
},
|
||||
Update {
|
||||
lsn: Lsn::from_str("0/10E49380").unwrap(),
|
||||
command: Command::Update(42164224),
|
||||
timeline_id: *ids[0],
|
||||
},
|
||||
Update {
|
||||
lsn: Lsn::from_str("0/10E49380").unwrap(),
|
||||
command: Command::BranchFrom(Some(*ids[0])),
|
||||
timeline_id: *ids[2],
|
||||
},
|
||||
Update {
|
||||
lsn: Lsn::from_str("0/11D74910").unwrap(),
|
||||
command: Command::Update(42172416),
|
||||
timeline_id: *ids[2],
|
||||
},
|
||||
Update {
|
||||
lsn: Lsn::from_str("0/12051E98").unwrap(),
|
||||
command: Command::Update(42196992),
|
||||
timeline_id: *ids[0],
|
||||
},
|
||||
];
|
||||
|
||||
let mut sorted = commands;
|
||||
|
||||
// these must sort in the same order, regardless of how the ids sort
|
||||
// which is why the timeline_id is the last field
|
||||
sorted.sort_unstable();
|
||||
|
||||
assert_eq!(commands, sorted, "{:#?} vs. {:#?}", commands, sorted);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn verify_size_for_multiple_branches() {
|
||||
// this is generated from integration test test_tenant_size_with_multiple_branches, but this way
|
||||
// it has stable LSNs
|
||||
let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#;
|
||||
|
||||
let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
|
||||
|
||||
assert_eq!(inputs.calculate().unwrap(), 36_409_872);
|
||||
}
|
||||
@@ -16,7 +16,7 @@ use std::fs;
use std::ops::{Deref, Range};
use std::path::PathBuf;
use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering};
use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError};
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
use std::time::{Duration, Instant, SystemTime};

use crate::tenant::{
@@ -34,7 +34,6 @@ use crate::tenant::{
use crate::config::{PageServerConf, METADATA_FILE_NAME};
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::metrics::TimelineMetrics;
use crate::page_image_cache;
use crate::pgdatadir_mapping::BlockNumber;
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
@@ -62,6 +61,13 @@ use crate::{
storage_sync::{self, index::LayerFileMetadata},
};

#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum FlushLoopState {
NotStarted,
Running,
Exited,
}

pub struct Timeline {
conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<TenantConfOpt>>,
@@ -122,8 +128,16 @@ pub struct Timeline {
/// to avoid deadlock.
write_lock: Mutex<()>,

/// Used to ensure that there is only task performing flushing at a time
layer_flush_lock: Mutex<()>,
/// Used to avoid multiple `flush_loop` tasks running
flush_loop_state: Mutex<FlushLoopState>,

/// layer_flush_start_tx can be used to wake up the layer-flushing task.
/// The value is a counter, incremented every time a new flush cycle is requested.
/// The flush cycle counter is sent back on the layer_flush_done channel when
/// the flush finishes. You can use that to wait for the flush to finish.
layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,

/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
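The field comments above describe the new flush handshake: a request counter on one watch channel, and a completion counter (plus result) echoed back on a second one. Below is a minimal, self-contained sketch of that pattern rather than pageserver code; every name in it is made up, and it assumes only the tokio watch API that the fields themselves use.

use tokio::sync::watch;

#[tokio::main]
async fn main() {
    // "start" carries the latest requested flush cycle, "done" echoes the last finished one.
    let (start_tx, mut start_rx) = watch::channel(0u64);
    let (done_tx, mut done_rx) = watch::channel(0u64);

    // Worker: wake up whenever the request counter changes, do the work, echo the counter back.
    tokio::spawn(async move {
        while start_rx.changed().await.is_ok() {
            let requested = *start_rx.borrow();
            // ... flush frozen layers here ...
            let _ = done_tx.send(requested);
        }
    });

    // Requester: bump the counter, remember our request number, wait for "done" to catch up.
    let mut my_request = 0;
    start_tx.send_modify(|counter| {
        *counter += 1;
        my_request = *counter;
    });
    while *done_rx.borrow() < my_request {
        done_rx.changed().await.expect("worker dropped the done channel");
    }
}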
@@ -273,6 +287,11 @@ impl LogicalSize {
self.size_added_after_initial
.fetch_add(delta, AtomicOrdering::SeqCst);
}

/// Returns the initialized (already calculated) value, if any.
fn initialized_size(&self) -> Option<u64> {
self.initial_logical_size.get().copied()
}
}

pub struct WalReceiverInfo {
@@ -462,15 +481,16 @@ impl Timeline {
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
/// know anything about them here in the repository.
pub fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
#[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))]
pub async fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
match cconf {
CheckpointConfig::Flush => {
self.freeze_inmem_layer(false);
self.flush_frozen_layers(true)
self.flush_frozen_layers_and_wait().await
}
CheckpointConfig::Forced => {
self.freeze_inmem_layer(false);
self.flush_frozen_layers(true)?;
self.flush_frozen_layers_and_wait().await?;
self.compact()
}
}
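As an aside on the `#[instrument(...)]` attribute gained by the now-async `checkpoint` above: it wraps the call in a tracing span, so every event logged inside carries the listed fields. A small illustrative sketch follows; the type, method, and crate setup (tracing, tracing-subscriber, tokio) are assumptions for the example, not pageserver code.

use tracing::{info, instrument};

struct Worker {
    tenant_id: u32,
    timeline_id: u32,
}

impl Worker {
    // Same shape as the attribute above: skip `self` from the span, record two fields.
    #[instrument(skip(self), fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
    async fn checkpoint_like(&self) {
        info!("this event is tagged with tenant_id and timeline_id from the span");
    }
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt().init();
    Worker { tenant_id: 1, timeline_id: 7 }.checkpoint_like().await;
}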
@@ -620,24 +640,8 @@ impl Timeline {
self.last_freeze_at.store(last_lsn);
*(self.last_freeze_ts.write().unwrap()) = Instant::now();

// Launch a task to flush the frozen layer to disk, unless
// a task was already running. (If the task was running
// at the time that we froze the layer, it must've seen the
// the layer we just froze before it exited; see comments
// in flush_frozen_layers())
if let Ok(guard) = self.layer_flush_lock.try_lock() {
drop(guard);
let self_clone = Arc::clone(self);
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::LayerFlushTask,
Some(self.tenant_id),
Some(self.timeline_id),
"layer flush task",
false,
async move { self_clone.flush_frozen_layers(false) },
);
}
// Wake up the layer flusher
self.flush_frozen_layers();
}
}
Ok(())
@@ -728,6 +732,9 @@ impl Timeline {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
let (state, _) = watch::channel(TimelineState::Suspended);

let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));

let mut result = Timeline {
conf,
tenant_conf,
@@ -755,8 +762,12 @@ impl Timeline {

upload_layers: AtomicBool::new(upload_layers),

flush_loop_state: Mutex::new(FlushLoopState::NotStarted),

layer_flush_start_tx,
layer_flush_done_tx,

write_lock: Mutex::new(()),
layer_flush_lock: Mutex::new(()),
layer_removal_cs: Mutex::new(()),

gc_info: RwLock::new(GcInfo {
@@ -789,6 +800,48 @@ impl Timeline {
result
}

pub(super) fn maybe_spawn_flush_loop(self: &Arc<Self>) {
let mut flush_loop_state = self.flush_loop_state.lock().unwrap();
match *flush_loop_state {
FlushLoopState::NotStarted => (),
FlushLoopState::Running => {
info!(
"skipping attempt to start flush_loop twice {}/{}",
self.tenant_id, self.timeline_id
);
return;
}
FlushLoopState::Exited => {
warn!(
"ignoring attempt to restart exited flush_loop {}/{}",
self.tenant_id, self.timeline_id
);
return;
}
}

let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
let self_clone = Arc::clone(self);
info!("spawning flush loop");
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::LayerFlushTask,
Some(self.tenant_id),
Some(self.timeline_id),
"layer flush task",
false,
async move {
self_clone.flush_loop(layer_flush_start_rx).await;
let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
assert_eq!(*flush_loop_state, FlushLoopState::Running);
*flush_loop_state = FlushLoopState::Exited;
Ok(()) }
.instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id))
);

*flush_loop_state = FlushLoopState::Running;
}

pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
if !is_etcd_client_initialized() {
if cfg!(test) {
@@ -980,9 +1033,26 @@ impl Timeline {
/// Calculate the logical size of the database at the latest LSN.
///
/// NOTE: counted incrementally, includes ancestors, this can be a slow operation.
fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result<u64> {
info!("Calculating logical size for timeline {}", self.timeline_id);
let timer = self.metrics.init_logical_size_histo.start_timer();
pub fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result<u64> {
info!(
"Calculating logical size for timeline {} at {}",
self.timeline_id, up_to_lsn
);
let timer = if up_to_lsn == self.initdb_lsn {
if let Some(size) = self.current_logical_size.initialized_size() {
if size != 0 {
// non-zero size means that the size has already been calculated by this method
// after startup. if the logical size is for a new timeline without layers the
// size will be zero, and we cannot use that, or this caching strategy until
// pageserver restart.
return Ok(size);
}
}

self.metrics.init_logical_size_histo.start_timer()
} else {
self.metrics.logical_size_histo.start_timer()
};
let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?;
debug!("calculated logical size: {logical_size}");
timer.stop_and_record();
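The new branch above reuses the logical size calculated at `initdb_lsn` once it is known, and deliberately refuses to treat a zero value as a cache hit. A tiny sketch of that compute-once idea is shown here; it assumes a `once_cell::sync::OnceCell<u64>` like the one `initialized_size()` appears to read, and everything else in it is invented.

use once_cell::sync::OnceCell;

struct CachedLogicalSize {
    initial: OnceCell<u64>,
}

impl CachedLogicalSize {
    // Return the cached value when it is present and non-zero; otherwise recompute.
    // A zero can simply mean "empty new timeline", so it is not treated as a cache hit.
    fn get_or_compute(&self, compute: impl FnOnce() -> u64) -> u64 {
        match self.initial.get().copied() {
            Some(size) if size != 0 => size,
            _ => {
                let size = compute();
                // Ignore the race where another caller initialized the cell first.
                let _ = self.initial.set(size);
                size
            }
        }
    }
}

fn main() {
    let cache = CachedLogicalSize { initial: OnceCell::new() };
    let first = cache.get_or_compute(|| 42_000_000);
    let second = cache.get_or_compute(|| panic!("should not be recomputed"));
    assert_eq!(first, second);
}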
@@ -1268,53 +1338,95 @@ impl Timeline {
drop(layers);
}

/// Flush all frozen layers to disk.
///
/// Only one task at a time can be doing layer-flushing for a
/// given timeline. If 'wait' is true, and another task is
/// currently doing the flushing, this function will wait for it
/// to finish. If 'wait' is false, this function will return
/// immediately instead.
fn flush_frozen_layers(&self, wait: bool) -> anyhow::Result<()> {
let flush_lock_guard = if wait {
self.layer_flush_lock.lock().unwrap()
} else {
match self.layer_flush_lock.try_lock() {
Ok(guard) => guard,
Err(TryLockError::WouldBlock) => return Ok(()),
Err(TryLockError::Poisoned(err)) => panic!("{:?}", err),
}
};

let timer = self.metrics.flush_time_histo.start_timer();

/// Layer flusher task's main loop.
async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>) {
info!("started flush loop");
loop {
let layers = self.layers.read().unwrap();
if let Some(frozen_layer) = layers.frozen_layers.front() {
let frozen_layer = Arc::clone(frozen_layer);
drop(layers); // to allow concurrent reads and writes
self.flush_frozen_layer(frozen_layer)?;
} else {
// Drop the 'layer_flush_lock' *before* 'layers'. That
// way, if you freeze a layer, and then call
// flush_frozen_layers(false), it is guaranteed that
// if another thread was busy flushing layers and the
// call therefore returns immediately, the other
// thread will have seen the newly-frozen layer and
// will flush that too (assuming no errors).
drop(flush_lock_guard);
drop(layers);
break;
tokio::select! {
_ = task_mgr::shutdown_watcher() => {
info!("shutting down layer flush task");
break;
},
_ = layer_flush_start_rx.changed() => {}
}

trace!("waking up");
let timer = self.metrics.flush_time_histo.start_timer();
let flush_counter = *layer_flush_start_rx.borrow();
let result = loop {
let layer_to_flush = {
let layers = self.layers.read().unwrap();
layers.frozen_layers.front().cloned()
// drop 'layers' lock to allow concurrent reads and writes
};
if let Some(layer_to_flush) = layer_to_flush {
if let Err(err) = self.flush_frozen_layer(layer_to_flush).await {
error!("could not flush frozen layer: {err:?}");
break Err(err);
}
continue;
} else {
break Ok(());
}
};
// Notify any listeners that we're done
let _ = self
.layer_flush_done_tx
.send_replace((flush_counter, result));

timer.stop_and_record();
}
}

async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> {
let mut rx = self.layer_flush_done_tx.subscribe();

// Increment the flush cycle counter and wake up the flush task.
// Remember the new value, so that when we listen for the flush
// to finish, we know when the flush that we initiated has
// finished, instead of some other flush that was started earlier.
let mut my_flush_request = 0;

let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
if flush_loop_state != FlushLoopState::Running {
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
}

timer.stop_and_record();
self.layer_flush_start_tx.send_modify(|counter| {
my_flush_request = *counter + 1;
*counter = my_flush_request;
});

Ok(())
loop {
{
let (last_result_counter, last_result) = &*rx.borrow();
if *last_result_counter >= my_flush_request {
if let Err(_err) = last_result {
// We already logged the original error in
// flush_loop. We cannot propagate it to the caller
// here, because it might not be Cloneable
anyhow::bail!(
"Could not flush frozen layer. Request id: {}",
my_flush_request
);
} else {
return Ok(());
}
}
}
trace!("waiting for flush to complete");
rx.changed().await?;
trace!("done")
}
}

fn flush_frozen_layers(&self) {
self.layer_flush_start_tx.send_modify(|val| *val += 1);
}

/// Flush one frozen in-memory layer to disk, as a new delta layer.
fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
#[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.filename().display()))]
async fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
// As a special case, when we have just imported an image into the repository,
// instead of writing out a L0 delta layer, we directly write out image layer
// files instead. This is possible as long as *all* the data imported into the
@@ -1542,6 +1654,10 @@ impl Timeline {
lsn,
)?;

fail_point!("image-layer-writer-fail-before-finish", |_| {
anyhow::bail!("failpoint image-layer-writer-fail-before-finish");
});

for range in &partition.ranges {
let mut key = range.start;
while key < range.end {
@@ -1836,6 +1952,11 @@ impl Timeline {
},
)?);
}

fail_point!("delta-layer-writer-fail-before-finish", |_| {
anyhow::bail!("failpoint delta-layer-writer-fail-before-finish");
});

writer.as_mut().unwrap().put_value(key, lsn, value)?;
prev_key = Some(key);
}
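The two `fail_point!` sites added above only fire when the `fail` crate's failpoints are armed, typically from a test. A hedged sketch of how a test might arm and clear one of them follows; the failpoint name matches the diff, while the rest of the test body is invented.

// Assumes the `fail` crate with failpoints enabled.
#[test]
fn image_layer_writer_failpoint_demo() {
    // Arm the failpoint so the closure in the diff above runs and bails out.
    fail::cfg("image-layer-writer-fail-before-finish", "return").unwrap();

    // ... exercise the code path that writes image layers and assert that it
    // surfaces the "failpoint image-layer-writer-fail-before-finish" error ...

    // Disarm it again so other tests are unaffected.
    fail::remove("image-layer-writer-fail-before-finish");
}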
@@ -2235,13 +2356,10 @@ impl Timeline {

let last_rec_lsn = data.records.last().unwrap().0;

let img = self.walredo_mgr.request_redo(
key,
request_lsn,
base_img,
data.records,
self.pg_version,
)?;
let img = self
.walredo_mgr
.request_redo(key, request_lsn, base_img, data.records, self.pg_version)
.context("Failed to reconstruct a page image:")?;

if img.len() == page_cache::PAGE_SZ {
let cache = page_cache::get();
@@ -2316,7 +2434,6 @@ impl<'a> TimelineWriter<'a> {
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> {
page_image_cache::remove(key, self.tenant_id, self.timeline_id);
self.tl.put_value(key, lsn, value)
}

@@ -82,6 +82,7 @@ pub struct TenantConf {
/// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
/// to avoid eager reconnects.
pub max_lsn_wal_lag: NonZeroU64,
pub trace_read_requests: bool,
}

/// Same as TenantConf, but this struct preserves the information about
@@ -105,6 +106,7 @@ pub struct TenantConfOpt {
#[serde(with = "humantime_serde")]
pub lagging_wal_timeout: Option<Duration>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
}

impl TenantConfOpt {
@@ -138,6 +140,9 @@ impl TenantConfOpt {
.lagging_wal_timeout
.unwrap_or(global_conf.lagging_wal_timeout),
max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
trace_read_requests: self
.trace_read_requests
.unwrap_or(global_conf.trace_read_requests),
}
}
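The new `trace_read_requests` option follows the same fallback pattern as the other tenant settings above: an `Option` in the per-tenant overrides that defaults to the global value when unset. A small standalone illustration with made-up struct names:

struct GlobalConf {
    trace_read_requests: bool,
}

struct TenantOverrides {
    trace_read_requests: Option<bool>,
}

fn effective(overrides: &TenantOverrides, global: &GlobalConf) -> bool {
    // Per-tenant setting wins when present, otherwise use the global default.
    overrides
        .trace_read_requests
        .unwrap_or(global.trace_read_requests)
}

fn main() {
    let global = GlobalConf { trace_read_requests: false };
    assert!(!effective(&TenantOverrides { trace_read_requests: None }, &global));
    assert!(effective(&TenantOverrides { trace_read_requests: Some(true) }, &global));
}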
@@ -207,10 +212,10 @@ impl TenantConf {
.expect("cannot parse default walreceiver lagging wal timeout"),
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
.expect("cannot parse default max walreceiver Lsn wal lag"),
trace_read_requests: false,
}
}

#[cfg(test)]
pub fn dummy_conf() -> Self {
TenantConf {
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
@@ -232,6 +237,7 @@ impl TenantConf {
.unwrap(),
max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
.unwrap(),
trace_read_requests: false,
}
}
}

@@ -241,7 +241,7 @@ pub async fn shutdown_all_tenants() {
let tenant_id = tenant.tenant_id();
debug!("shutdown tenant {tenant_id}");

if let Err(err) = tenant.checkpoint() {
if let Err(err) = tenant.checkpoint().await {
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
}
}

@@ -71,9 +71,7 @@ async fn compaction_loop(tenant_id: TenantId) {
let mut sleep_duration = tenant.get_compaction_period();
if let Err(e) = tenant.compaction_iteration() {
sleep_duration = wait_duration;
error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration);
#[cfg(feature = "testing")]
std::process::abort();
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
}

// Sleep
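Part of the change above swaps the error formatting from `{e:#}` to `{e:?}` and drops the testing-only abort. For anyhow errors the two render differently: the alternate Display form prints the whole context chain on one line, while the Debug form prints it over several lines plus a backtrace when one was captured. A short illustration, assuming the `anyhow` crate:

use anyhow::anyhow;

fn main() {
    let err = anyhow!("disk full").context("Compaction failed");
    println!("{err:#}"); // one line: "Compaction failed: disk full"
    println!("{err:?}"); // multi-line: message, "Caused by:" chain, optional backtrace
}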
@@ -119,12 +117,10 @@ async fn gc_loop(tenant_id: TenantId) {
let gc_horizon = tenant.get_gc_horizon();
let mut sleep_duration = gc_period;
if gc_horizon > 0 {
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false)
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
{
sleep_duration = wait_duration;
error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
#[cfg(feature = "testing")]
std::process::abort();
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
}
}

pageserver/src/trace.rs (new file, 36 lines)
@@ -0,0 +1,36 @@
use bytes::Bytes;
use std::{
fs::{create_dir_all, File},
io::{BufWriter, Write},
path::PathBuf,
};

pub struct Tracer {
writer: BufWriter<File>,
}

impl Drop for Tracer {
fn drop(&mut self) {
self.flush()
}
}

impl Tracer {
pub fn new(path: PathBuf) -> Self {
let parent = path.parent().expect("failed to parse parent path");
create_dir_all(parent).expect("failed to create trace dir");

let file = File::create(path).expect("failed to create trace file");
Tracer {
writer: BufWriter::new(file),
}
}

pub fn trace(&mut self, msg: &Bytes) {
self.writer.write_all(msg).expect("failed to write trace");
}

pub fn flush(&mut self) {
self.writer.flush().expect("failed to flush trace file");
}
}
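The new `Tracer` is a thin buffered writer over a trace file; data is flushed explicitly or on drop. A hypothetical usage snippet follows, assuming `Tracer` from the file above is in scope; the path and message bytes are invented.

use bytes::Bytes;
use std::path::PathBuf;

fn main() {
    // `Tracer::new` creates the parent directory and the trace file.
    let mut tracer = Tracer::new(PathBuf::from("/tmp/neon-traces/timeline.trace"));

    // Each call appends the raw bytes to the buffered writer.
    tracer.trace(&Bytes::from_static(b"example request bytes"));

    // Dropping the tracer flushes the buffer (see `impl Drop for Tracer`).
    drop(tracer);
}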
Some files were not shown because too many files have changed in this diff.