mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-21 07:00:38 +00:00
Compare commits
46 Commits
seqscan-pe
...
jk/cleanup
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ca1ed3dc3b | ||
|
|
dc2554dff6 | ||
|
|
5112142997 | ||
|
|
a0a74868a4 | ||
|
|
b154992510 | ||
|
|
a86a38c96e | ||
|
|
590f894db8 | ||
|
|
0a0595b98d | ||
|
|
e56d11c8e1 | ||
|
|
ccdc3188ed | ||
|
|
67401cbdb8 | ||
|
|
d42700280f | ||
|
|
6df4d5c911 | ||
|
|
32d14403bd | ||
|
|
0df3467146 | ||
|
|
c64a121aa8 | ||
|
|
22cc8760b9 | ||
|
|
596d622a82 | ||
|
|
7481fb082c | ||
|
|
1eb9bd052a | ||
|
|
59a3ca4ec6 | ||
|
|
e86a9105a4 | ||
|
|
d3c8749da5 | ||
|
|
128dc8d405 | ||
|
|
0cbae6e8f3 | ||
|
|
78e412b84b | ||
|
|
6dbf202e0d | ||
|
|
b42bf9265a | ||
|
|
1f08ba5790 | ||
|
|
0c54eb65fb | ||
|
|
259a5f356e | ||
|
|
a3cb8c11e0 | ||
|
|
9fb2287f87 | ||
|
|
834ffe1bac | ||
|
|
df18b041c0 | ||
|
|
39897105b2 | ||
|
|
2f399f08b2 | ||
|
|
9f49605041 | ||
|
|
7b6431cbd7 | ||
|
|
321aeac3d4 | ||
|
|
71ef7b6663 | ||
|
|
5928cb33c5 | ||
|
|
6ff2c61ae0 | ||
|
|
7480a0338a | ||
|
|
2709878b8b | ||
|
|
39e4bdb99e |
19
.github/actions/run-python-test-set/action.yml
vendored
19
.github/actions/run-python-test-set/action.yml
vendored
@@ -73,6 +73,14 @@ runs:
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Download compatibility snapshot for Postgres 14
|
||||
if: inputs.build_type != 'remote'
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: compatibility-snapshot-${{ inputs.build_type }}-pg14
|
||||
path: /tmp/compatibility_snapshot_pg14
|
||||
prefix: latest
|
||||
|
||||
- name: Run pytest
|
||||
env:
|
||||
NEON_BIN: /tmp/neon/bin
|
||||
@@ -80,6 +88,8 @@ runs:
|
||||
BUILD_TYPE: ${{ inputs.build_type }}
|
||||
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
|
||||
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
|
||||
ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes')
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# PLATFORM will be embedded in the perf test report
|
||||
@@ -154,6 +164,15 @@ runs:
|
||||
scripts/generate_and_push_perf_report.sh
|
||||
fi
|
||||
|
||||
- name: Upload compatibility snapshot for Postgres 14
|
||||
if: github.ref_name == 'release'
|
||||
uses: ./.github/actions/upload
|
||||
with:
|
||||
name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
|
||||
# The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
|
||||
path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/
|
||||
prefix: latest
|
||||
|
||||
- name: Create Allure report
|
||||
if: always()
|
||||
uses: ./.github/actions/allure-report
|
||||
|
||||
2
.github/ansible/neon-stress.hosts.yaml
vendored
2
.github/ansible/neon-stress.hosts.yaml
vendored
@@ -3,7 +3,6 @@ storage:
|
||||
bucket_name: neon-storage-ireland
|
||||
bucket_region: eu-west-1
|
||||
console_mgmt_base_url: http://neon-stress-console.local
|
||||
env_name: neon-stress
|
||||
etcd_endpoints: neon-stress-etcd.local:2379
|
||||
safekeeper_enable_s3_offload: 'false'
|
||||
pageserver_config_stub:
|
||||
@@ -12,6 +11,7 @@ storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "{{ inventory_hostname }}"
|
||||
safekeeper_s3_prefix: neon-stress/wal
|
||||
hostname_suffix: ".local"
|
||||
remote_user: admin
|
||||
children:
|
||||
|
||||
35
.github/ansible/prod.ap-southeast-1.hosts.yaml
vendored
Normal file
35
.github/ansible/prod.ap-southeast-1.hosts.yaml
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-ap-southeast-1
|
||||
bucket_region: ap-southeast-1
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: ap-southeast-1
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1
|
||||
console_region_id: aws-ap-southeast-1
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-064de8ea28bdb495b
|
||||
pageserver-1.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-0b180defcaeeb6b93
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-0d6f1dc5161eef894
|
||||
safekeeper-1.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-0e338adda8eb2d19f
|
||||
safekeeper-2.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-04fb63634e4679eb9
|
||||
35
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
Normal file
35
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-eu-central-1
|
||||
bucket_region: eu-central-1
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: eu-central-1
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1
|
||||
console_region_id: aws-eu-central-1
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-0cd8d316ecbb715be
|
||||
pageserver-1.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-090044ed3d383fef0
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-0b238612d2318a050
|
||||
safekeeper-1.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-07b9c45e5c2637cd4
|
||||
safekeeper-2.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-020257302c3c93d88
|
||||
36
.github/ansible/prod.us-east-2.hosts.yaml
vendored
Normal file
36
.github/ansible/prod.us-east-2.hosts.yaml
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-us-east-2
|
||||
bucket_region: us-east-2
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: us-east-2
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2
|
||||
console_region_id: aws-us-east-2
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-062227ba7f119eb8c
|
||||
pageserver-1.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-0b3ec0afab5968938
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-0e94224750c57d346
|
||||
safekeeper-1.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-06d113fb73bfddeb0
|
||||
safekeeper-2.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-09f66c8e04afff2e8
|
||||
|
||||
2
.github/ansible/production.hosts.yaml
vendored
2
.github/ansible/production.hosts.yaml
vendored
@@ -1,7 +1,6 @@
|
||||
---
|
||||
storage:
|
||||
vars:
|
||||
env_name: prod-1
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
bucket_name: zenith-storage-oregon
|
||||
bucket_region: us-west-2
|
||||
@@ -12,6 +11,7 @@ storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "{{ inventory_hostname }}"
|
||||
safekeeper_s3_prefix: prod-1/wal
|
||||
hostname_suffix: ".local"
|
||||
remote_user: admin
|
||||
|
||||
|
||||
1
.github/ansible/ssm_config
vendored
1
.github/ansible/ssm_config
vendored
@@ -1,3 +1,2 @@
|
||||
ansible_connection: aws_ssm
|
||||
ansible_aws_ssm_bucket_name: neon-dev-bucket
|
||||
ansible_python_interpreter: /usr/bin/python3
|
||||
|
||||
2
.github/ansible/staging.hosts.yaml
vendored
2
.github/ansible/staging.hosts.yaml
vendored
@@ -3,7 +3,6 @@ storage:
|
||||
bucket_name: zenith-staging-storage-us-east-1
|
||||
bucket_region: us-east-1
|
||||
console_mgmt_base_url: http://console-staging.local
|
||||
env_name: us-stage
|
||||
etcd_endpoints: zenith-us-stage-etcd.local:2379
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
@@ -11,6 +10,7 @@ storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "{{ inventory_hostname }}"
|
||||
safekeeper_s3_prefix: us-stage/wal
|
||||
hostname_suffix: ".local"
|
||||
remote_user: admin
|
||||
|
||||
|
||||
3
.github/ansible/staging.us-east-2.hosts.yaml
vendored
3
.github/ansible/staging.us-east-2.hosts.yaml
vendored
@@ -3,7 +3,6 @@ storage:
|
||||
bucket_name: neon-staging-storage-us-east-2
|
||||
bucket_region: us-east-2
|
||||
console_mgmt_base_url: http://console-staging.local
|
||||
env_name: us-stage
|
||||
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
@@ -11,9 +10,11 @@ storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: us-east-2
|
||||
ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
|
||||
console_region_id: aws-us-east-2
|
||||
|
||||
children:
|
||||
|
||||
2
.github/ansible/systemd/safekeeper.service
vendored
2
.github/ansible/systemd/safekeeper.service
vendored
@@ -6,7 +6,7 @@ After=network.target auditd.service
|
||||
Type=simple
|
||||
User=safekeeper
|
||||
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
KillSignal=SIGINT
|
||||
|
||||
31
.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
vendored
Normal file
31
.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
domain: "*.us-east-2.aws.neon.build"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: dev
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
# serviceMonitor:
|
||||
# enabled: true
|
||||
# selector:
|
||||
# release: kube-prometheus-stack
|
||||
31
.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
vendored
Normal file
31
.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.ap-southeast-1.aws.neon.tech"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: ap-southeast-1
|
||||
zenith_region_slug: ap-southeast-1
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
# serviceMonitor:
|
||||
# enabled: true
|
||||
# selector:
|
||||
# release: kube-prometheus-stack
|
||||
31
.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
vendored
Normal file
31
.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.eu-central-1.aws.neon.tech"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: eu-central-1
|
||||
zenith_region_slug: eu-central-1
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
# serviceMonitor:
|
||||
# enabled: true
|
||||
# selector:
|
||||
# release: kube-prometheus-stack
|
||||
31
.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
vendored
Normal file
31
.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.us-east-2.aws.neon.tech"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
# serviceMonitor:
|
||||
# enabled: true
|
||||
# selector:
|
||||
# release: kube-prometheus-stack
|
||||
155
.github/workflows/build_and_test.yml
vendored
155
.github/workflows/build_and_test.yml
vendored
@@ -127,8 +127,8 @@ jobs:
|
||||
target/
|
||||
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
|
||||
key: |
|
||||
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
|
||||
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-
|
||||
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
|
||||
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
@@ -389,7 +389,7 @@ jobs:
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git/
|
||||
target/
|
||||
key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
|
||||
key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
|
||||
|
||||
- name: Get Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -625,11 +625,11 @@ jobs:
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
run: |
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
|
||||
|
||||
- name: Configure Docker Hub login
|
||||
run: |
|
||||
@@ -756,9 +756,9 @@ jobs:
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
strategy:
|
||||
matrix:
|
||||
target_region: [ us-east-2 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -781,7 +781,47 @@ jobs:
|
||||
fi
|
||||
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
|
||||
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-prod-new:
|
||||
runs-on: prod
|
||||
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
|
||||
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||
if: |
|
||||
(github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
target_region: [ us-east-2, eu-central-1, ap-southeast-1 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Redeploy
|
||||
run: |
|
||||
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
cd "$(pwd)/.github/ansible"
|
||||
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
./get_binaries.sh
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
RELEASE=true ./get_binaries.sh
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-proxy:
|
||||
@@ -825,3 +865,94 @@ jobs:
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
|
||||
deploy-proxy-new:
|
||||
runs-on: dev
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||
if: |
|
||||
(github.ref_name == 'main') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- target_region: us-east-2
|
||||
target_cluster: dev-us-east-2-beta
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Configure environment
|
||||
run: |
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
|
||||
|
||||
- name: Re-deploy proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
|
||||
deploy-proxy-prod-new:
|
||||
runs-on: prod
|
||||
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||
if: |
|
||||
(github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- target_region: us-east-2
|
||||
target_cluster: prod-us-east-2-delta
|
||||
- target_region: eu-central-1
|
||||
target_cluster: prod-eu-central-1-gamma
|
||||
- target_region: ap-southeast-1
|
||||
target_cluster: prod-ap-southeast-1-epsilon
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Configure environment
|
||||
run: |
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
|
||||
|
||||
- name: Re-deploy proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
|
||||
promote-compatibility-test-snapshot:
|
||||
runs-on: dev
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
needs: [ deploy, deploy-proxy ]
|
||||
if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
|
||||
steps:
|
||||
- name: Promote compatibility snapshot for the release
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
BUCKET: neon-github-public-dev
|
||||
PREFIX: artifacts/latest
|
||||
run: |
|
||||
for build_type in debug release; do
|
||||
OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
|
||||
NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst
|
||||
|
||||
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
|
||||
done
|
||||
|
||||
2
.github/workflows/codestyle.yml
vendored
2
.github/workflows/codestyle.yml
vendored
@@ -106,7 +106,7 @@ jobs:
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git
|
||||
target
|
||||
key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
|
||||
key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
|
||||
|
||||
- name: Run cargo clippy
|
||||
run: ./run_clippy.sh
|
||||
|
||||
45
Cargo.lock
generated
45
Cargo.lock
generated
@@ -317,12 +317,6 @@ dependencies = [
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "boxfnonce"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426"
|
||||
|
||||
[[package]]
|
||||
name = "bstr"
|
||||
version = "1.0.1"
|
||||
@@ -600,6 +594,7 @@ dependencies = [
|
||||
"tar",
|
||||
"thiserror",
|
||||
"toml",
|
||||
"url",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -849,16 +844,6 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "daemonize"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70c24513e34f53b640819f0ac9f705b673fcf4006d7aab8778bee72ebfc89815"
|
||||
dependencies = [
|
||||
"boxfnonce",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling"
|
||||
version = "0.14.1"
|
||||
@@ -2140,7 +2125,6 @@ dependencies = [
|
||||
"crc32c",
|
||||
"criterion",
|
||||
"crossbeam-utils",
|
||||
"daemonize",
|
||||
"etcd_broker",
|
||||
"fail",
|
||||
"futures",
|
||||
@@ -2170,6 +2154,7 @@ dependencies = [
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"signal-hook",
|
||||
"svg_fmt",
|
||||
"tar",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
@@ -2188,7 +2173,10 @@ dependencies = [
|
||||
name = "pageserver_api"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"const_format",
|
||||
"postgres_ffi",
|
||||
"serde",
|
||||
"serde_with",
|
||||
"utils",
|
||||
@@ -3083,7 +3071,6 @@ dependencies = [
|
||||
"clap 4.0.15",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
"daemonize",
|
||||
"etcd_broker",
|
||||
"fs2",
|
||||
"git-version",
|
||||
@@ -3091,6 +3078,7 @@ dependencies = [
|
||||
"humantime",
|
||||
"hyper",
|
||||
"metrics",
|
||||
"nix 0.25.0",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"postgres",
|
||||
@@ -3461,6 +3449,12 @@ version = "2.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601"
|
||||
|
||||
[[package]]
|
||||
name = "svg_fmt"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2"
|
||||
|
||||
[[package]]
|
||||
name = "symbolic-common"
|
||||
version = "8.8.0"
|
||||
@@ -3932,6 +3926,16 @@ dependencies = [
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-serde"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-subscriber"
|
||||
version = "0.3.16"
|
||||
@@ -3942,12 +3946,15 @@ dependencies = [
|
||||
"nu-ansi-term",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sharded-slab",
|
||||
"smallvec",
|
||||
"thread_local",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
"tracing-serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4042,6 +4049,8 @@ dependencies = [
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"signal-hook",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
|
||||
@@ -44,7 +44,7 @@ COPY . .
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
||||
RUN set -e \
|
||||
&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin safekeeper --bin proxy --locked --release \
|
||||
&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin proxy --locked --release \
|
||||
&& cachepot -s
|
||||
|
||||
# Build final image
|
||||
@@ -65,6 +65,7 @@ RUN set -e \
|
||||
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
|
||||
|
||||
@@ -1,50 +1,50 @@
|
||||
ARG TAG=pinned
|
||||
# apparently, ARGs don't get replaced in RUN commands in kaniko
|
||||
# ARG POSTGIS_VERSION=3.3.0
|
||||
# ARG PLV8_VERSION=3.1.4
|
||||
# ARG PG_VERSION=v14
|
||||
#
|
||||
# This file is identical to the Dockerfile.compute-node-v15 file
|
||||
# except for the version of Postgres that is built.
|
||||
#
|
||||
|
||||
ARG TAG=pinned
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "build-deps"
|
||||
#
|
||||
#########################################################################################
|
||||
FROM debian:bullseye-slim AS build-deps
|
||||
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
|
||||
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
|
||||
apt update
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
|
||||
libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-build"
|
||||
# Build Postgres from the neon postgres repository.
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-build
|
||||
COPY vendor/postgres-v14 postgres
|
||||
RUN cd postgres && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "postgis-build"
|
||||
# Build PostGIS from the upstream PostGIS mirror.
|
||||
#
|
||||
# PostGIS compiles against neon postgres sources without changes. Perhaps we
|
||||
# could even use the upstream binaries, compiled against vanilla Postgres, but
|
||||
# it would require some investigation to check that it works, and also keeps
|
||||
# working in the future. So for now, we compile our own binaries.
|
||||
#########################################################################################
|
||||
FROM build-deps AS postgis-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
RUN apt update && \
|
||||
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc
|
||||
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
|
||||
tar xvzf postgis-3.3.0.tar.gz && \
|
||||
cd postgis-3.3.0 && \
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||
tar xvzf postgis-3.3.1.tar.gz && \
|
||||
cd postgis-3.3.1 && \
|
||||
./autogen.sh && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
./configure && \
|
||||
@@ -57,19 +57,29 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "plv8-build"
|
||||
# Build plv8
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS plv8-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
RUN apt update && \
|
||||
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5
|
||||
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
|
||||
|
||||
# https://github.com/plv8/plv8/issues/475
|
||||
# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
|
||||
RUN apt update && \
|
||||
apt install -y --no-install-recommends -t testing binutils
|
||||
# https://github.com/plv8/plv8/issues/475:
|
||||
# v8 uses gold for linking and sets `--thread-count=4` which breaks
|
||||
# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
|
||||
# Install newer gold version manually as debian-testing binutils version updates
|
||||
# libc version, which in turn breaks other extensions built against non-testing libc.
|
||||
RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
|
||||
tar xvzf binutils-2.38.tar.gz && \
|
||||
cd binutils-2.38 && \
|
||||
cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
cd ../bfd && ./configure && make bfdver.h && \
|
||||
cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
|
||||
cp /usr/local/bin/ld.gold /usr/bin/gold
|
||||
|
||||
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
|
||||
@@ -77,21 +87,25 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
|
||||
cd plv8-3.1.4 && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "h3-pg-build"
|
||||
# Build h3_pg
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS h3-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# packaged cmake is too old
|
||||
RUN apt update && \
|
||||
apt install -y --no-install-recommends -t testing cmake
|
||||
RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
|
||||
-q -O /tmp/cmake-install.sh \
|
||||
&& chmod u+x /tmp/cmake-install.sh \
|
||||
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
|
||||
&& rm /tmp/cmake-install.sh
|
||||
|
||||
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
|
||||
tar xvzf h3.tgz && \
|
||||
@@ -110,12 +124,15 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "neon-pg-ext-build"
|
||||
# compile neon extensions
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS neon-pg-ext-build
|
||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
@@ -128,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
-C pgxn/neon \
|
||||
-s install
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Compile and run the Neon-specific `compute_ctl` binary
|
||||
#
|
||||
#########################################################################################
|
||||
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
|
||||
USER nonroot
|
||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
COPY --chown=nonroot . .
|
||||
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Clean up postgres folder before inclusion
|
||||
#
|
||||
#########################################################################################
|
||||
FROM neon-pg-ext-build AS postgres-cleanup-layer
|
||||
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
|
||||
|
||||
@@ -155,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src
|
||||
# if they were to be used by other libraries.
|
||||
RUN rm /usr/local/pgsql/lib/lib*.a
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Final layer
|
||||
# Put it all together into the final image
|
||||
#
|
||||
#########################################################################################
|
||||
FROM debian:bullseye-slim
|
||||
# Add user postgres
|
||||
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
@@ -175,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
|
||||
# libreadline8 for psql
|
||||
# libossp-uuid16 for extension ossp-uuid
|
||||
# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
|
||||
# GLIBC 2.34 for plv8.
|
||||
# Debian bullseye provides GLIBC 2.31, so we install the library from testing
|
||||
#
|
||||
# Lastly, link compute_ctl into zenith_ctl while we're at it,
|
||||
# so that we don't need to put this in another layer.
|
||||
@@ -189,12 +212,6 @@ RUN apt update && \
|
||||
libproj19 \
|
||||
libprotobuf-c1 && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
echo "Installing GLIBC 2.34" && \
|
||||
echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
|
||||
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
|
||||
apt update && \
|
||||
apt install -y --no-install-recommends -t testing libc6 && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
|
||||
|
||||
USER postgres
|
||||
|
||||
@@ -4,44 +4,39 @@
|
||||
#
|
||||
|
||||
ARG TAG=pinned
|
||||
# apparently, ARGs don't get replaced in RUN commands in kaniko
|
||||
# ARG POSTGIS_VERSION=3.3.1
|
||||
# ARG PLV8_VERSION=3.1.4
|
||||
# ARG PG_VERSION=v15
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "build-deps"
|
||||
#
|
||||
#########################################################################################
|
||||
FROM debian:bullseye-slim AS build-deps
|
||||
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
|
||||
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
|
||||
apt update
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
|
||||
libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-build"
|
||||
# Build Postgres from the neon postgres repository.
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-build
|
||||
COPY vendor/postgres-v15 postgres
|
||||
RUN cd postgres && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "postgis-build"
|
||||
# Build PostGIS from the upstream PostGIS mirror.
|
||||
#
|
||||
# PostGIS compiles against neon postgres sources without changes. Perhaps we
|
||||
# could even use the upstream binaries, compiled against vanilla Postgres, but
|
||||
# it would require some investigation to check that it works, and also keeps
|
||||
# working in the future. So for now, we compile our own binaries.
|
||||
#########################################################################################
|
||||
FROM build-deps AS postgis-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
RUN apt update && \
|
||||
@@ -62,19 +57,29 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "plv8-build"
|
||||
# Build plv8
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS plv8-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
RUN apt update && \
|
||||
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5
|
||||
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
|
||||
|
||||
# https://github.com/plv8/plv8/issues/475
|
||||
# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
|
||||
RUN apt update && \
|
||||
apt install -y --no-install-recommends -t testing binutils
|
||||
# https://github.com/plv8/plv8/issues/475:
|
||||
# v8 uses gold for linking and sets `--thread-count=4` which breaks
|
||||
# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
|
||||
# Install newer gold version manually as debian-testing binutils version updates
|
||||
# libc version, which in turn breaks other extensions built against non-testing libc.
|
||||
RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
|
||||
tar xvzf binutils-2.38.tar.gz && \
|
||||
cd binutils-2.38 && \
|
||||
cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
cd ../bfd && ./configure && make bfdver.h && \
|
||||
cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
|
||||
cp /usr/local/bin/ld.gold /usr/bin/gold
|
||||
|
||||
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
|
||||
@@ -82,21 +87,25 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
|
||||
cd plv8-3.1.4 && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "h3-pg-build"
|
||||
# Build h3_pg
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS h3-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# packaged cmake is too old
|
||||
RUN apt update && \
|
||||
apt install -y --no-install-recommends -t testing cmake
|
||||
RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
|
||||
-q -O /tmp/cmake-install.sh \
|
||||
&& chmod u+x /tmp/cmake-install.sh \
|
||||
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
|
||||
&& rm /tmp/cmake-install.sh
|
||||
|
||||
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
|
||||
tar xvzf h3.tgz && \
|
||||
@@ -115,12 +124,15 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "neon-pg-ext-build"
|
||||
# compile neon extensions
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS neon-pg-ext-build
|
||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
@@ -133,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
-C pgxn/neon \
|
||||
-s install
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Compile and run the Neon-specific `compute_ctl` binary
|
||||
#
|
||||
#########################################################################################
|
||||
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
|
||||
USER nonroot
|
||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
COPY --chown=nonroot . .
|
||||
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Clean up postgres folder before inclusion
|
||||
#
|
||||
#########################################################################################
|
||||
FROM neon-pg-ext-build AS postgres-cleanup-layer
|
||||
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
|
||||
|
||||
@@ -160,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src
|
||||
# if they were to be used by other libraries.
|
||||
RUN rm /usr/local/pgsql/lib/lib*.a
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Final layer
|
||||
# Put it all together into the final image
|
||||
#
|
||||
#########################################################################################
|
||||
FROM debian:bullseye-slim
|
||||
# Add user postgres
|
||||
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
@@ -180,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
|
||||
# libreadline8 for psql
|
||||
# libossp-uuid16 for extension ossp-uuid
|
||||
# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
|
||||
# GLIBC 2.34 for plv8.
|
||||
# Debian bullseye provides GLIBC 2.31, so we install the library from testing
|
||||
#
|
||||
# Lastly, link compute_ctl into zenith_ctl while we're at it,
|
||||
# so that we don't need to put this in another layer.
|
||||
@@ -194,12 +212,6 @@ RUN apt update && \
|
||||
libproj19 \
|
||||
libprotobuf-c1 && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
echo "Installing GLIBC 2.34" && \
|
||||
echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
|
||||
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
|
||||
apt update && \
|
||||
apt install -y --no-install-recommends -t testing libc6 && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
|
||||
|
||||
USER postgres
|
||||
|
||||
10
Makefile
10
Makefile
@@ -151,6 +151,11 @@ neon-pg-ext-v14: postgres-v14
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
|
||||
+@echo "Compiling neon_walredo v14"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
|
||||
+@echo "Compiling neon_test_utils" v14
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
|
||||
@@ -163,6 +168,11 @@ neon-pg-ext-v15: postgres-v15
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
|
||||
+@echo "Compiling neon_walredo v15"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
|
||||
+@echo "Compiling neon_test_utils" v15
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \
|
||||
|
||||
@@ -223,10 +223,7 @@ Ensure your dependencies are installed as described [here](https://github.com/ne
|
||||
```sh
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
|
||||
# either:
|
||||
CARGO_BUILD_FLAGS="--features=testing" make
|
||||
# or:
|
||||
make debug
|
||||
|
||||
./scripts/pytest
|
||||
```
|
||||
|
||||
@@ -424,8 +424,29 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||
db_client.simple_query(&alter_query)?;
|
||||
|
||||
// Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
|
||||
// This is needed since postgres 15, where this privilege is removed by default.
|
||||
let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string();
|
||||
// This is needed because since postgres 15 this privilege is removed by default.
|
||||
let grant_query = "DO $$\n\
|
||||
BEGIN\n\
|
||||
IF EXISTS(\n\
|
||||
SELECT nspname\n\
|
||||
FROM pg_catalog.pg_namespace\n\
|
||||
WHERE nspname = 'public'\n\
|
||||
) AND\n\
|
||||
current_setting('server_version_num')::int/10000 >= 15\n\
|
||||
THEN\n\
|
||||
IF EXISTS(\n\
|
||||
SELECT rolname\n\
|
||||
FROM pg_catalog.pg_roles\n\
|
||||
WHERE rolname = 'web_access'\n\
|
||||
)\n\
|
||||
THEN\n\
|
||||
GRANT CREATE ON SCHEMA public TO web_access;\n\
|
||||
END IF;\n\
|
||||
END IF;\n\
|
||||
END\n\
|
||||
$$;"
|
||||
.to_string();
|
||||
|
||||
info!("grant query for db {} : {}", &db.name, &grant_query);
|
||||
db_client.simple_query(&grant_query)?;
|
||||
}
|
||||
|
||||
@@ -4,20 +4,21 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
clap = "4.0"
|
||||
comfy-table = "6.1"
|
||||
git-version = "0.3.5"
|
||||
tar = "0.4.38"
|
||||
nix = "0.25"
|
||||
once_cell = "1.13.0"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
regex = "1"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "2.0"
|
||||
toml = "0.5"
|
||||
once_cell = "1.13.0"
|
||||
regex = "1"
|
||||
anyhow = "1.0"
|
||||
tar = "0.4.38"
|
||||
thiserror = "1"
|
||||
nix = "0.25"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
toml = "0.5"
|
||||
url = "2.2.2"
|
||||
|
||||
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
|
||||
# instead, so that recompile times are better.
|
||||
|
||||
264
control_plane/src/background_process.rs
Normal file
264
control_plane/src/background_process.rs
Normal file
@@ -0,0 +1,264 @@
|
||||
//! Spawns and kills background processes that are needed by Neon CLI.
|
||||
//! Applies common set-up such as log and pid files (if needed) to every process.
|
||||
//!
|
||||
//! Neon CLI does not run in background, so it needs to store the information about
|
||||
//! spawned processes, which it does in this module.
|
||||
//! We do that by storing the pid of the process in the "${process_name}.pid" file.
|
||||
//! The pid file can be created by the process itself
|
||||
//! (Neon storage binaries do that and also ensure that a lock is taken onto that file)
|
||||
//! or we create such file after starting the process
|
||||
//! (non-Neon binaries don't necessarily follow our pidfile conventions).
|
||||
//! The pid stored in the file is later used to stop the service.
|
||||
//!
|
||||
//! See [`lock_file`] module for more info.
|
||||
|
||||
use std::ffi::OsStr;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::process::{Child, Command};
|
||||
use std::time::Duration;
|
||||
use std::{fs, io, thread};
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
|
||||
use utils::lock_file;
|
||||
|
||||
const RETRIES: u32 = 15;
|
||||
const RETRY_TIMEOUT_MILLIS: u64 = 500;
|
||||
|
||||
/// Tells `start_process` who is responsible for writing the pid file of the spawned process.
pub enum InitialPidFile<'t> {
    /// `start_process` writes the pid file at this path itself right after spawning,
    /// so that future CLI invocations are able to find and manipulate the process.
    Create(&'t Path),
    /// The spawned process writes the pid file at this path on its own;
    /// `start_process` only waits for that file to show up.
    Expect(&'t Path),
}
|
||||
|
||||
/// Start a background child process using the parameters given.
|
||||
pub fn start_process<F, S: AsRef<OsStr>>(
|
||||
process_name: &str,
|
||||
datadir: &Path,
|
||||
command: &Path,
|
||||
args: &[S],
|
||||
initial_pid_file: InitialPidFile,
|
||||
process_status_check: F,
|
||||
) -> anyhow::Result<Child>
|
||||
where
|
||||
F: Fn() -> anyhow::Result<bool>,
|
||||
{
|
||||
let log_path = datadir.join(format!("{process_name}.log"));
|
||||
let process_log_file = fs::OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
.append(true)
|
||||
.open(&log_path)
|
||||
.with_context(|| {
|
||||
format!("Could not open {process_name} log file {log_path:?} for writing")
|
||||
})?;
|
||||
let same_file_for_stderr = process_log_file.try_clone().with_context(|| {
|
||||
format!("Could not reuse {process_name} log file {log_path:?} for writing stderr")
|
||||
})?;
|
||||
|
||||
let mut command = Command::new(command);
|
||||
let background_command = command
|
||||
.stdout(process_log_file)
|
||||
.stderr(same_file_for_stderr)
|
||||
.args(args);
|
||||
let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
|
||||
|
||||
let mut spawned_process = filled_cmd.spawn().with_context(|| {
|
||||
format!("Could not spawn {process_name}, see console output and log files for details.")
|
||||
})?;
|
||||
let pid = spawned_process.id();
|
||||
let pid = Pid::from_raw(
|
||||
i32::try_from(pid)
|
||||
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
|
||||
);
|
||||
|
||||
let pid_file_to_check = match initial_pid_file {
|
||||
InitialPidFile::Create(target_pid_file_path) => {
|
||||
match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
|
||||
lock_file::LockCreationResult::Created { .. } => {
|
||||
// We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
|
||||
// as this CLI invocation exits, so it's a bit useless, but doesn't any harm either.
|
||||
}
|
||||
lock_file::LockCreationResult::AlreadyLocked { .. } => {
|
||||
anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
|
||||
}
|
||||
lock_file::LockCreationResult::CreationFailed(e) => {
|
||||
return Err(e.context(format!(
|
||||
"Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
|
||||
)))
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
|
||||
};
|
||||
|
||||
for retries in 0..RETRIES {
|
||||
match process_started(pid, pid_file_to_check, &process_status_check) {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} started, pid: {pid}");
|
||||
return Ok(spawned_process);
|
||||
}
|
||||
Ok(false) => {
|
||||
if retries < 5 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
} else {
|
||||
if retries == 5 {
|
||||
println!() // put a line break after dots for second message
|
||||
}
|
||||
println!("{process_name} has not started yet, retrying ({retries})...");
|
||||
}
|
||||
thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS));
|
||||
}
|
||||
Err(e) => {
|
||||
println!("{process_name} failed to start: {e:#}");
|
||||
if let Err(e) = spawned_process.kill() {
|
||||
println!("Could not stop {process_name} subprocess: {e:#}")
|
||||
};
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
anyhow::bail!("{process_name} could not start in {RETRIES} attempts");
|
||||
}
|
||||
|
||||
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
|
||||
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
|
||||
if !pid_file.exists() {
|
||||
println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
|
||||
return Ok(());
|
||||
}
|
||||
let pid = read_pidfile(pid_file)?;
|
||||
|
||||
let sig = if immediate {
|
||||
print!("Stopping {process_name} with pid {pid} immediately..");
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
print!("Stopping {process_name} with pid {pid} gracefully..");
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(()) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!(
|
||||
"{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
Err(e) => anyhow::bail!("Failed to send signal to {process_name} with pid {pid}: {e}"),
|
||||
}
|
||||
|
||||
// Wait until process is gone
|
||||
for _ in 0..RETRIES {
|
||||
match process_has_stopped(pid) {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} stopped");
|
||||
if let Err(e) = fs::remove_file(pid_file) {
|
||||
if e.kind() != io::ErrorKind::NotFound {
|
||||
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
Ok(false) => {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
thread::sleep(Duration::from_secs(1))
|
||||
}
|
||||
Err(e) => {
|
||||
println!("{process_name} with pid {pid} failed to stop: {e:#}");
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts");
|
||||
}
|
||||
|
||||
/// Resets the command's environment and re-adds the Rust-related variables.
///
/// The environment is cleared, `RUST_BACKTRACE=1` is always set, and
/// `LLVM_PROFILE_FILE` / `RUST_LOG` are copied from the current process when
/// present. `RUST_LOG` is only copied if its value is valid unicode.
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
    cmd.env_clear().env("RUST_BACKTRACE", "1");

    let var = "LLVM_PROFILE_FILE";
    if let Some(val) = std::env::var_os(var) {
        cmd.env(var, val);
    }

    const RUST_LOG_KEY: &str = "RUST_LOG";
    if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
        cmd.env(RUST_LOG_KEY, rust_log_value);
    }

    cmd
}
|
||||
|
||||
/// Copies the AWS credential variables from the current process into `cmd`.
///
/// Only variables that are set (and valid unicode) in the current environment
/// are forwarded; the rest of the command's environment is left as is.
fn fill_aws_secrets_vars(cmd: &mut Command) -> &mut Command {
    const FORWARDED_KEYS: [&str; 3] = [
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SESSION_TOKEN",
    ];

    for env_key in FORWARDED_KEYS {
        if let Ok(value) = std::env::var(env_key) {
            cmd.env(env_key, value);
        }
    }
    cmd
}
|
||||
|
||||
fn process_started<F>(
|
||||
pid: Pid,
|
||||
pid_file_to_check: Option<&Path>,
|
||||
status_check: &F,
|
||||
) -> anyhow::Result<bool>
|
||||
where
|
||||
F: Fn() -> anyhow::Result<bool>,
|
||||
{
|
||||
match status_check() {
|
||||
Ok(true) => match pid_file_to_check {
|
||||
Some(pid_file_path) => {
|
||||
if pid_file_path.exists() {
|
||||
let pid_in_file = read_pidfile(pid_file_path)?;
|
||||
Ok(pid_in_file == pid)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
None => Ok(true),
|
||||
},
|
||||
Ok(false) => Ok(false),
|
||||
Err(e) => anyhow::bail!("process failed to start: {e}"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Read a PID file
|
||||
///
|
||||
/// We expect a file that contains a single integer.
|
||||
fn read_pidfile(pidfile: &Path) -> Result<Pid> {
|
||||
let pid_str = fs::read_to_string(pidfile)
|
||||
.with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
|
||||
let pid: i32 = pid_str
|
||||
.parse()
|
||||
.map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
|
||||
if pid < 1 {
|
||||
bail!("pidfile {pidfile:?} contained bad value '{pid}'");
|
||||
}
|
||||
Ok(Pid::from_raw(pid))
|
||||
}
|
||||
|
||||
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
|
||||
match kill(pid, None) {
|
||||
// Process exists, keep waiting
|
||||
Ok(_) => Ok(false),
|
||||
// Process not found, we're done
|
||||
Err(Errno::ESRCH) => Ok(true),
|
||||
Err(err) => anyhow::bail!("Failed to send signal to process with pid {pid}: {err}"),
|
||||
}
|
||||
}
|
||||
@@ -9,8 +9,8 @@ use anyhow::{anyhow, bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
|
||||
use control_plane::compute::ComputeControlPlane;
|
||||
use control_plane::local_env::{EtcdBroker, LocalEnv};
|
||||
use control_plane::pageserver::PageServerNode;
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage::PageServerNode;
|
||||
use control_plane::{etcd, local_env};
|
||||
use pageserver_api::models::TimelineInfo;
|
||||
use pageserver_api::{
|
||||
|
||||
@@ -12,15 +12,14 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use utils::{
|
||||
connstring::connection_host_port,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
|
||||
use crate::pageserver::PageServerNode;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
use crate::storage::PageServerNode;
|
||||
|
||||
//
|
||||
// ComputeControlPlane
|
||||
@@ -282,9 +281,7 @@ impl PostgresNode {
|
||||
fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
|
||||
let mut conf = PostgresConf::new();
|
||||
conf.append("max_wal_senders", "10");
|
||||
// wal_log_hints is mandatory when running against pageserver (see gh issue#192)
|
||||
// TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
|
||||
conf.append("wal_log_hints", "on");
|
||||
conf.append("wal_log_hints", "off");
|
||||
conf.append("max_replication_slots", "10");
|
||||
conf.append("hot_standby", "on");
|
||||
conf.append("shared_buffers", "1MB");
|
||||
@@ -302,7 +299,8 @@ impl PostgresNode {
|
||||
|
||||
// Configure the node to fetch pages from pageserver
|
||||
let pageserver_connstr = {
|
||||
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
|
||||
let config = &self.pageserver.pg_connection_config;
|
||||
let (host, port) = (config.host(), config.port());
|
||||
|
||||
// Set up authentication
|
||||
//
|
||||
|
||||
57
control_plane/src/connection.rs
Normal file
57
control_plane/src/connection.rs
Normal file
@@ -0,0 +1,57 @@
|
||||
use url::Url;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PgConnectionConfig {
|
||||
url: Url,
|
||||
}
|
||||
|
||||
impl PgConnectionConfig {
|
||||
pub fn host(&self) -> &str {
|
||||
self.url.host_str().expect("BUG: no host")
|
||||
}
|
||||
|
||||
pub fn port(&self) -> u16 {
|
||||
self.url.port().expect("BUG: no port")
|
||||
}
|
||||
|
||||
/// Return a `<host>:<port>` string.
|
||||
pub fn raw_address(&self) -> String {
|
||||
format!("{}:{}", self.host(), self.port())
|
||||
}
|
||||
|
||||
/// Connect using postgres protocol with TLS disabled.
|
||||
pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
|
||||
postgres::Client::connect(self.url.as_str(), postgres::NoTls)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for PgConnectionConfig {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let mut url: Url = s.parse()?;
|
||||
|
||||
match url.scheme() {
|
||||
"postgres" | "postgresql" => {}
|
||||
other => anyhow::bail!("invalid scheme: {other}"),
|
||||
}
|
||||
|
||||
// It's not a valid connection url if host is unavailable.
|
||||
if url.host().is_none() {
|
||||
anyhow::bail!(url::ParseError::EmptyHost);
|
||||
}
|
||||
|
||||
// E.g. `postgres:bar`.
|
||||
if url.cannot_be_a_base() {
|
||||
anyhow::bail!("URL cannot be a base");
|
||||
}
|
||||
|
||||
// Set the default PG port if it's missing.
|
||||
if url.port().is_none() {
|
||||
url.set_port(Some(5432))
|
||||
.expect("BUG: couldn't set the default port");
|
||||
}
|
||||
|
||||
Ok(Self { url })
|
||||
}
|
||||
}
|
||||
@@ -1,95 +1,75 @@
|
||||
use std::{
|
||||
fs,
|
||||
path::PathBuf,
|
||||
process::{Command, Stdio},
|
||||
};
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
use anyhow::Context;
|
||||
use nix::{
|
||||
sys::signal::{kill, Signal},
|
||||
unistd::Pid,
|
||||
};
|
||||
|
||||
use crate::{local_env, read_pidfile};
|
||||
use crate::{background_process, local_env};
|
||||
|
||||
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let etcd_broker = &env.etcd_broker;
|
||||
println!(
|
||||
"Starting etcd broker using {}",
|
||||
etcd_broker.etcd_binary_path.display()
|
||||
"Starting etcd broker using {:?}",
|
||||
etcd_broker.etcd_binary_path
|
||||
);
|
||||
|
||||
let etcd_data_dir = env.base_data_dir.join("etcd");
|
||||
fs::create_dir_all(&etcd_data_dir).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd data dir: {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
fs::create_dir_all(&etcd_data_dir)
|
||||
.with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?;
|
||||
|
||||
let etcd_stdout_file =
|
||||
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd stout file in directory {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
let etcd_stderr_file =
|
||||
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd stderr file in directory {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
let client_urls = etcd_broker.comma_separated_endpoints();
|
||||
let args = [
|
||||
format!("--data-dir={}", etcd_data_dir.display()),
|
||||
format!("--listen-client-urls={client_urls}"),
|
||||
format!("--advertise-client-urls={client_urls}"),
|
||||
// Set --quota-backend-bytes to keep the etcd virtual memory
|
||||
// size smaller. Our test etcd clusters are very small.
|
||||
// See https://github.com/etcd-io/etcd/issues/7910
|
||||
"--quota-backend-bytes=100000000".to_string(),
|
||||
// etcd doesn't compact (vacuum) with default settings,
|
||||
// enable it to prevent space exhaustion.
|
||||
"--auto-compaction-mode=revision".to_string(),
|
||||
"--auto-compaction-retention=1".to_string(),
|
||||
];
|
||||
|
||||
let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
|
||||
.args(&[
|
||||
format!("--data-dir={}", etcd_data_dir.display()),
|
||||
format!("--listen-client-urls={client_urls}"),
|
||||
format!("--advertise-client-urls={client_urls}"),
|
||||
// Set --quota-backend-bytes to keep the etcd virtual memory
|
||||
// size smaller. Our test etcd clusters are very small.
|
||||
// See https://github.com/etcd-io/etcd/issues/7910
|
||||
"--quota-backend-bytes=100000000".to_string(),
|
||||
])
|
||||
.stdout(Stdio::from(etcd_stdout_file))
|
||||
.stderr(Stdio::from(etcd_stderr_file))
|
||||
.spawn()
|
||||
.context("Failed to spawn etcd subprocess")?;
|
||||
let pid = etcd_process.id();
|
||||
let pid_file_path = etcd_pid_file_path(env);
|
||||
|
||||
let etcd_pid_file_path = etcd_pid_file_path(env);
|
||||
fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd pid file at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?;
|
||||
let client = reqwest::blocking::Client::new();
|
||||
|
||||
background_process::start_process(
|
||||
"etcd",
|
||||
&etcd_data_dir,
|
||||
&etcd_broker.etcd_binary_path,
|
||||
&args,
|
||||
background_process::InitialPidFile::Create(&pid_file_path),
|
||||
|| {
|
||||
for broker_endpoint in &etcd_broker.broker_endpoints {
|
||||
let request = broker_endpoint
|
||||
.join("health")
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to append /health path to broker endopint {}",
|
||||
broker_endpoint
|
||||
)
|
||||
})
|
||||
.and_then(|url| {
|
||||
client.get(&url.to_string()).build().with_context(|| {
|
||||
format!("Failed to construct request to etcd endpoint {url}")
|
||||
})
|
||||
})?;
|
||||
if client.execute(request).is_ok() {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(false)
|
||||
},
|
||||
)
|
||||
.context("Failed to spawn etcd subprocess")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let etcd_path = &env.etcd_broker.etcd_binary_path;
|
||||
println!("Stopping etcd broker at {}", etcd_path.display());
|
||||
|
||||
let etcd_pid_file_path = etcd_pid_file_path(env);
|
||||
let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read etcd pid file at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?);
|
||||
|
||||
kill(pid, Signal::SIGTERM).with_context(|| {
|
||||
format!(
|
||||
"Failed to stop etcd with pid {pid} at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
background_process::stop_process(true, "etcd", &etcd_pid_file_path(env))
|
||||
}
|
||||
|
||||
fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
|
||||
|
||||
@@ -6,59 +6,12 @@
|
||||
// Intended to be used in integration tests and in CLI tools for
|
||||
// local installations.
|
||||
//
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
|
||||
mod background_process;
|
||||
pub mod compute;
|
||||
pub mod connection;
|
||||
pub mod etcd;
|
||||
pub mod local_env;
|
||||
pub mod pageserver;
|
||||
pub mod postgresql_conf;
|
||||
pub mod safekeeper;
|
||||
pub mod storage;
|
||||
|
||||
/// Read a PID file
|
||||
///
|
||||
/// We expect a file that contains a single integer.
|
||||
/// We return an i32 for compatibility with libc and nix.
|
||||
pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
|
||||
let pid_str = fs::read_to_string(pidfile)
|
||||
.with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
|
||||
let pid: i32 = pid_str
|
||||
.parse()
|
||||
.map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
|
||||
if pid < 1 {
|
||||
bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
|
||||
}
|
||||
Ok(pid)
|
||||
}
|
||||
|
||||
/// Clears the command's environment, sets `RUST_BACKTRACE=1`, and copies
/// `LLVM_PROFILE_FILE` and `RUST_LOG` from the current process when they are set.
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
    cmd.env_clear();
    cmd.env("RUST_BACKTRACE", "1");

    let var = "LLVM_PROFILE_FILE";
    match std::env::var_os(var) {
        Some(val) => {
            cmd.env(var, val);
        }
        None => {}
    }

    const RUST_LOG_KEY: &str = "RUST_LOG";
    match std::env::var(RUST_LOG_KEY) {
        Ok(rust_log_value) => cmd.env(RUST_LOG_KEY, rust_log_value),
        Err(_) => cmd,
    }
}
|
||||
|
||||
/// Forwards the AWS credential variables that are set in the current process
/// environment to `cmd`; variables that are unset are skipped.
fn fill_aws_secrets_vars(cmd: &mut Command) -> &mut Command {
    for env_key in [
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SESSION_TOKEN",
    ] {
        match std::env::var(env_key) {
            Ok(value) => {
                cmd.env(env_key, value);
            }
            Err(_) => continue,
        }
    }
    cmd
}
|
||||
|
||||
@@ -226,12 +226,12 @@ impl LocalEnv {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pageserver_bin(&self) -> anyhow::Result<PathBuf> {
|
||||
Ok(self.neon_distrib_dir.join("pageserver"))
|
||||
pub fn pageserver_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("pageserver")
|
||||
}
|
||||
|
||||
pub fn safekeeper_bin(&self) -> anyhow::Result<PathBuf> {
|
||||
Ok(self.neon_distrib_dir.join("safekeeper"))
|
||||
pub fn safekeeper_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("safekeeper")
|
||||
}
|
||||
|
||||
pub fn pg_data_dirs_path(&self) -> PathBuf {
|
||||
|
||||
@@ -1,33 +1,27 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{BufReader, Write};
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::time::Duration;
|
||||
use std::{io, result, thread};
|
||||
use std::process::Child;
|
||||
use std::{io, result};
|
||||
|
||||
use crate::connection::PgConnectionConfig;
|
||||
use anyhow::{bail, Context};
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use pageserver_api::models::{
|
||||
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use postgres::{Config, NoTls};
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
http::error::HttpErrorBody,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum PageserverHttpError {
|
||||
@@ -75,7 +69,7 @@ impl ResponseErrorMessageExt for Response {
|
||||
//
|
||||
#[derive(Debug)]
|
||||
pub struct PageServerNode {
|
||||
pub pg_connection_config: Config,
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: Client,
|
||||
pub http_base_url: String,
|
||||
@@ -101,7 +95,7 @@ impl PageServerNode {
|
||||
}
|
||||
|
||||
/// Construct libpq connection string for connecting to the pageserver.
|
||||
fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config {
|
||||
fn pageserver_connection_config(password: &str, listen_addr: &str) -> PgConnectionConfig {
|
||||
format!("postgresql://no_user:{password}@{listen_addr}/no_db")
|
||||
.parse()
|
||||
.unwrap()
|
||||
@@ -161,7 +155,15 @@ impl PageServerNode {
|
||||
init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'");
|
||||
}
|
||||
|
||||
self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?;
|
||||
let mut pageserver_process = self
|
||||
.start_node(&init_config_overrides, &self.env.base_data_dir, true)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to start a process for pageserver {}",
|
||||
self.env.pageserver.id,
|
||||
)
|
||||
})?;
|
||||
|
||||
let init_result = self
|
||||
.try_init_timeline(create_tenant, initial_timeline_id, pg_version)
|
||||
.context("Failed to create initial tenant and timeline for pageserver");
|
||||
@@ -171,7 +173,29 @@ impl PageServerNode {
|
||||
}
|
||||
Err(e) => eprintln!("{e:#}"),
|
||||
}
|
||||
self.stop(false)?;
|
||||
match pageserver_process.kill() {
|
||||
Err(e) => {
|
||||
eprintln!(
|
||||
"Failed to stop pageserver {} process with pid {}: {e:#}",
|
||||
self.env.pageserver.id,
|
||||
pageserver_process.id(),
|
||||
)
|
||||
}
|
||||
Ok(()) => {
|
||||
println!(
|
||||
"Stopped pageserver {} process with pid {}",
|
||||
self.env.pageserver.id,
|
||||
pageserver_process.id(),
|
||||
);
|
||||
// cleanup after pageserver startup, since we do not call regular `stop_process` during init
|
||||
let pid_file = self.pid_file();
|
||||
if let Err(e) = fs::remove_file(&pid_file) {
|
||||
if e.kind() != io::ErrorKind::NotFound {
|
||||
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
init_result
|
||||
}
|
||||
|
||||
@@ -196,11 +220,14 @@ impl PageServerNode {
|
||||
self.env.pageserver_data_dir()
|
||||
}
|
||||
|
||||
pub fn pid_file(&self) -> PathBuf {
|
||||
/// The pid file is created by the pageserver process, with its pid stored inside.
|
||||
/// Other pageservers cannot lock the same file and overwrite it for as long as the current
|
||||
/// pageserver runs. (Unless someone removes the file manually; never do that!)
|
||||
fn pid_file(&self) -> PathBuf {
|
||||
self.repo_path().join("pageserver.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
|
||||
self.start_node(config_overrides, &self.repo_path(), false)
|
||||
}
|
||||
|
||||
@@ -209,10 +236,10 @@ impl PageServerNode {
|
||||
config_overrides: &[&str],
|
||||
datadir: &Path,
|
||||
update_config: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<Child> {
|
||||
println!(
|
||||
"Starting pageserver at '{}' in '{}'",
|
||||
connection_address(&self.pg_connection_config),
|
||||
self.pg_connection_config.raw_address(),
|
||||
datadir.display()
|
||||
);
|
||||
io::stdout().flush()?;
|
||||
@@ -220,10 +247,7 @@ impl PageServerNode {
|
||||
let mut args = vec![
|
||||
"-D",
|
||||
datadir.to_str().with_context(|| {
|
||||
format!(
|
||||
"Datadir path '{}' cannot be represented as a unicode string",
|
||||
datadir.display()
|
||||
)
|
||||
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
];
|
||||
|
||||
@@ -235,48 +259,18 @@ impl PageServerNode {
|
||||
args.extend(["-c", config_override]);
|
||||
}
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
|
||||
filled_cmd = fill_aws_secrets_vars(filled_cmd);
|
||||
|
||||
if !filled_cmd.status()?.success() {
|
||||
bail!(
|
||||
"Pageserver failed to start. See console output and '{}' for details.",
|
||||
datadir.join("pageserver.log").display()
|
||||
);
|
||||
}
|
||||
|
||||
// It takes a while for the page server to start up. Wait until it is
|
||||
// open for business.
|
||||
const RETRIES: i8 = 15;
|
||||
for retries in 1..RETRIES {
|
||||
match self.check_status() {
|
||||
Ok(()) => {
|
||||
println!("\nPageserver started");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => {
|
||||
match err {
|
||||
PageserverHttpError::Transport(err) => {
|
||||
if err.is_connect() && retries < 5 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
} else {
|
||||
if retries == 5 {
|
||||
println!() // put a line break after dots for second message
|
||||
}
|
||||
println!("Pageserver not responding yet, err {err} retrying ({retries})...");
|
||||
}
|
||||
}
|
||||
PageserverHttpError::Response(msg) => {
|
||||
bail!("pageserver failed to start: {msg} ")
|
||||
}
|
||||
}
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("pageserver failed to start in {RETRIES} seconds");
|
||||
background_process::start_process(
|
||||
"pageserver",
|
||||
datadir,
|
||||
&self.env.pageserver_bin(),
|
||||
&args,
|
||||
background_process::InitialPidFile::Expect(&self.pid_file()),
|
||||
|| match self.check_status() {
|
||||
Ok(()) => Ok(true),
|
||||
Err(PageserverHttpError::Transport(_)) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -288,69 +282,18 @@ impl PageServerNode {
|
||||
/// If the server is not running, returns success
|
||||
///
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
let pid_file = self.pid_file();
|
||||
if !pid_file.exists() {
|
||||
println!("Pageserver is already stopped");
|
||||
return Ok(());
|
||||
}
|
||||
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
|
||||
|
||||
let sig = if immediate {
|
||||
print!("Stopping pageserver immediately..");
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
print!("Stopping pageserver gracefully..");
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!("Pageserver with pid {pid} does not exist, but a PID file was found");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to pageserver with pid {pid}: {}",
|
||||
err.desc()
|
||||
),
|
||||
}
|
||||
|
||||
// Wait until process is gone
|
||||
for i in 0..600 {
|
||||
let signal = None; // Send no signal, just get the error code
|
||||
match kill(pid, signal) {
|
||||
Ok(_) => (), // Process exists, keep waiting
|
||||
Err(Errno::ESRCH) => {
|
||||
// Process not found, we're done
|
||||
println!("done!");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to pageserver with pid {}: {}",
|
||||
pid,
|
||||
err.desc()
|
||||
),
|
||||
};
|
||||
|
||||
if i % 10 == 0 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
|
||||
bail!("Failed to stop pageserver with pid {pid}");
|
||||
background_process::stop_process(immediate, "pageserver", &self.pid_file())
|
||||
}
|
||||
|
||||
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
|
||||
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
|
||||
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
|
||||
|
||||
println!("Pageserver query: '{sql}'");
|
||||
client.simple_query(sql).unwrap()
|
||||
}
|
||||
|
||||
pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
|
||||
self.pg_connection_config.connect(NoTls)
|
||||
self.pg_connection_config.connect_no_tls()
|
||||
}
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
@@ -549,7 +492,7 @@ impl PageServerNode {
|
||||
pg_wal: Option<(Lsn, PathBuf)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
|
||||
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
|
||||
|
||||
// Init base reader
|
||||
let (start_lsn, base_tarfile_path) = base;
|
||||
@@ -1,23 +1,21 @@
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::process::Child;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{io, result, thread};
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::bail;
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use postgres::Config;
|
||||
use anyhow::Context;
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId};
|
||||
use utils::{http::error::HttpErrorBody, id::NodeId};
|
||||
|
||||
use crate::local_env::{LocalEnv, SafekeeperConf};
|
||||
use crate::storage::PageServerNode;
|
||||
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
|
||||
use crate::connection::PgConnectionConfig;
|
||||
use crate::pageserver::PageServerNode;
|
||||
use crate::{
|
||||
background_process,
|
||||
local_env::{LocalEnv, SafekeeperConf},
|
||||
};
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum SafekeeperHttpError {
|
||||
@@ -63,7 +61,7 @@ pub struct SafekeeperNode {
|
||||
|
||||
pub conf: SafekeeperConf,
|
||||
|
||||
pub pg_connection_config: Config,
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: Client,
|
||||
pub http_base_url: String,
|
||||
@@ -87,15 +85,15 @@ impl SafekeeperNode {
|
||||
}
|
||||
|
||||
/// Construct libpq connection string for connecting to this safekeeper.
|
||||
fn safekeeper_connection_config(port: u16) -> Config {
|
||||
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
|
||||
// TODO safekeeper authentication not implemented yet
|
||||
format!("postgresql://no_user@127.0.0.1:{}/no_db", port)
|
||||
format!("postgresql://no_user@127.0.0.1:{port}/no_db")
|
||||
.parse()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
|
||||
env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
|
||||
env.safekeeper_data_dir(&format!("sk{sk_id}"))
|
||||
}
|
||||
|
||||
pub fn datadir_path(&self) -> PathBuf {
|
||||
@@ -106,92 +104,78 @@ impl SafekeeperNode {
|
||||
self.datadir_path().join("safekeeper.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> anyhow::Result<()> {
|
||||
pub fn start(&self) -> anyhow::Result<Child> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
connection_address(&self.pg_connection_config),
|
||||
self.pg_connection_config.raw_address(),
|
||||
self.datadir_path().display()
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
|
||||
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
|
||||
let id = self.id;
|
||||
let datadir = self.datadir_path();
|
||||
|
||||
let mut cmd = Command::new(self.env.safekeeper_bin()?);
|
||||
fill_rust_env_vars(
|
||||
cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
|
||||
.args(&["--id", self.id.to_string().as_ref()])
|
||||
.args(&["--listen-pg", &listen_pg])
|
||||
.args(&["--listen-http", &listen_http])
|
||||
.args(&["--recall", "1 second"])
|
||||
.arg("--daemonize"),
|
||||
);
|
||||
let id_string = id.to_string();
|
||||
let mut args = vec![
|
||||
"-D",
|
||||
datadir.to_str().with_context(|| {
|
||||
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
"--id",
|
||||
&id_string,
|
||||
"--listen-pg",
|
||||
&listen_pg,
|
||||
"--listen-http",
|
||||
&listen_http,
|
||||
];
|
||||
if !self.conf.sync {
|
||||
cmd.arg("--no-sync");
|
||||
args.push("--no-sync");
|
||||
}
|
||||
|
||||
let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
|
||||
if !comma_separated_endpoints.is_empty() {
|
||||
cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
|
||||
args.extend(["--broker-endpoints", &comma_separated_endpoints]);
|
||||
}
|
||||
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
|
||||
cmd.args(&["--broker-etcd-prefix", prefix]);
|
||||
args.extend(["--broker-etcd-prefix", prefix]);
|
||||
}
|
||||
|
||||
let mut backup_threads = String::new();
|
||||
if let Some(threads) = self.conf.backup_threads {
|
||||
cmd.args(&["--backup-threads", threads.to_string().as_ref()]);
|
||||
backup_threads = threads.to_string();
|
||||
args.extend(["--backup-threads", &backup_threads]);
|
||||
} else {
|
||||
drop(backup_threads);
|
||||
}
|
||||
|
||||
if let Some(ref remote_storage) = self.conf.remote_storage {
|
||||
cmd.args(&["--remote-storage", remote_storage]);
|
||||
args.extend(["--remote-storage", remote_storage]);
|
||||
}
|
||||
|
||||
let key_path = self.env.base_data_dir.join("auth_public_key.pem");
|
||||
if self.conf.auth_enabled {
|
||||
cmd.arg("--auth-validation-public-key-path");
|
||||
// PathBuf is better be passed as is, not via `String`.
|
||||
cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
|
||||
args.extend([
|
||||
"--auth-validation-public-key-path",
|
||||
key_path.to_str().with_context(|| {
|
||||
format!("Key path {key_path:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
]);
|
||||
}
|
||||
|
||||
fill_aws_secrets_vars(&mut cmd);
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Safekeeper failed to start. See '{}' for details.",
|
||||
self.datadir_path().join("safekeeper.log").display()
|
||||
);
|
||||
}
|
||||
|
||||
// It takes a while for the safekeeper to start up. Wait until it is
|
||||
// open for business.
|
||||
const RETRIES: i8 = 15;
|
||||
for retries in 1..RETRIES {
|
||||
match self.check_status() {
|
||||
Ok(_) => {
|
||||
println!("\nSafekeeper started");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => {
|
||||
match err {
|
||||
SafekeeperHttpError::Transport(err) => {
|
||||
if err.is_connect() && retries < 5 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
} else {
|
||||
if retries == 5 {
|
||||
println!() // put a line break after dots for second message
|
||||
}
|
||||
println!(
|
||||
"Safekeeper not responding yet, err {} retrying ({})...",
|
||||
err, retries
|
||||
);
|
||||
}
|
||||
}
|
||||
SafekeeperHttpError::Response(msg) => {
|
||||
bail!("safekeeper failed to start: {} ", msg)
|
||||
}
|
||||
}
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("safekeeper failed to start in {} seconds", RETRIES);
|
||||
background_process::start_process(
|
||||
&format!("safekeeper {id}"),
|
||||
&datadir,
|
||||
&self.env.safekeeper_bin(),
|
||||
&args,
|
||||
background_process::InitialPidFile::Expect(&self.pid_file()),
|
||||
|| match self.check_status() {
|
||||
Ok(()) => Ok(true),
|
||||
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -203,63 +187,11 @@ impl SafekeeperNode {
|
||||
/// If the server is not running, returns success
|
||||
///
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
let pid_file = self.pid_file();
|
||||
if !pid_file.exists() {
|
||||
println!("Safekeeper {} is already stopped", self.id);
|
||||
return Ok(());
|
||||
}
|
||||
let pid = read_pidfile(&pid_file)?;
|
||||
let pid = Pid::from_raw(pid);
|
||||
|
||||
let sig = if immediate {
|
||||
print!("Stopping safekeeper {} immediately..", self.id);
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
print!("Stopping safekeeper {} gracefully..", self.id);
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!(
|
||||
"Safekeeper with pid {} does not exist, but a PID file was found",
|
||||
pid
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to safekeeper with pid {}: {}",
|
||||
pid,
|
||||
err.desc()
|
||||
),
|
||||
}
|
||||
|
||||
// Wait until process is gone
|
||||
for i in 0..600 {
|
||||
let signal = None; // Send no signal, just get the error code
|
||||
match kill(pid, signal) {
|
||||
Ok(_) => (), // Process exists, keep waiting
|
||||
Err(Errno::ESRCH) => {
|
||||
// Process not found, we're done
|
||||
println!("done!");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to pageserver with pid {}: {}",
|
||||
pid,
|
||||
err.desc()
|
||||
),
|
||||
};
|
||||
|
||||
if i % 10 == 0 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
|
||||
bail!("Failed to stop safekeeper with pid {}", pid);
|
||||
background_process::stop_process(
|
||||
immediate,
|
||||
&format!("safekeeper {}", self.id),
|
||||
&self.pid_file(),
|
||||
)
|
||||
}
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
|
||||
48
docker-compose/compute/shell/compute.sh
Executable file
48
docker-compose/compute/shell/compute.sh
Executable file
@@ -0,0 +1,48 @@
|
||||
#!/bin/bash
|
||||
set -eux
|
||||
|
||||
PG_VERSION=${PG_VERSION:-14}
|
||||
|
||||
SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
|
||||
SPEC_FILE=/tmp/spec.json
|
||||
|
||||
echo "Waiting pageserver become ready."
|
||||
while ! nc -z pageserver 6400; do
|
||||
sleep 1;
|
||||
done
|
||||
echo "Page server is ready."
|
||||
|
||||
echo "Create a tenant and timeline"
|
||||
PARAMS=(
|
||||
-sb
|
||||
-X POST
|
||||
-H "Content-Type: application/json"
|
||||
-d "{}"
|
||||
http://pageserver:9898/v1/tenant/
|
||||
)
|
||||
tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g')
|
||||
|
||||
PARAMS=(
|
||||
-sb
|
||||
-X POST
|
||||
-H "Content-Type: application/json"
|
||||
-d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}"
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
|
||||
echo "Overwrite tenant id and timeline id in spec file"
|
||||
tenant_id=$(echo ${result} | jq -r .tenant_id)
|
||||
timeline_id=$(echo ${result} | jq -r .timeline_id)
|
||||
|
||||
sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE}
|
||||
sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
|
||||
|
||||
cat ${SPEC_FILE}
|
||||
|
||||
echo "Start compute node"
|
||||
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
|
||||
-C "postgresql://cloud_admin@localhost:55433/postgres" \
|
||||
-b /usr/local/bin/postgres \
|
||||
-S ${SPEC_FILE}
|
||||
141
docker-compose/compute/var/db/postgres/specs/spec.json
Normal file
141
docker-compose/compute/var/db/postgres/specs/spec.json
Normal file
@@ -0,0 +1,141 @@
|
||||
{
|
||||
"format_version": 1.0,
|
||||
|
||||
"timestamp": "2022-10-12T18:00:00.000Z",
|
||||
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
|
||||
|
||||
"cluster": {
|
||||
"cluster_id": "docker_compose",
|
||||
"name": "docker_compose_test",
|
||||
"state": "restarted",
|
||||
"roles": [
|
||||
{
|
||||
"name": "cloud_admin",
|
||||
"encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
|
||||
"options": null
|
||||
}
|
||||
],
|
||||
"databases": [
|
||||
],
|
||||
"settings": [
|
||||
{
|
||||
"name": "fsync",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "wal_level",
|
||||
"value": "replica",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
"name": "hot_standby",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "wal_log_hints",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "log_connections",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "port",
|
||||
"value": "55433",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "shared_buffers",
|
||||
"value": "1MB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_connections",
|
||||
"value": "100",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "listen_addresses",
|
||||
"value": "0.0.0.0",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_wal_senders",
|
||||
"value": "10",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_slots",
|
||||
"value": "10",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "wal_sender_timeout",
|
||||
"value": "5s",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "wal_keep_size",
|
||||
"value": "0",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "password_encryption",
|
||||
"value": "md5",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
"name": "restart_after_crash",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "synchronous_standby_names",
|
||||
"value": "walproposer",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "shared_preload_libraries",
|
||||
"value": "neon",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.safekeepers",
|
||||
"value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.timeline_id",
|
||||
"value": "TIMELINE_ID",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.tenant_id",
|
||||
"value": "TENANT_ID",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.pageserver_connstring",
|
||||
"value": "host=pageserver port=6400",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_write_lag",
|
||||
"value": "500MB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_flush_lag",
|
||||
"value": "10GB",
|
||||
"vartype": "string"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
"delta_operations": [
|
||||
]
|
||||
}
|
||||
200
docker-compose/docker-compose.yml
Normal file
200
docker-compose/docker-compose.yml
Normal file
@@ -0,0 +1,200 @@
|
||||
version: '3'
|
||||
|
||||
services:
|
||||
etcd:
|
||||
image: quay.io/coreos/etcd:v3.5.4
|
||||
ports:
|
||||
- 2379:2379
|
||||
- 2380:2380
|
||||
environment:
|
||||
# This significantly speeds up etcd, and we don't need data persistence there anyway.
|
||||
ETCD_UNSAFE_NO_FSYNC: "1"
|
||||
command:
|
||||
- "etcd"
|
||||
- "--auto-compaction-mode=revision"
|
||||
- "--auto-compaction-retention=1"
|
||||
- "--name=etcd-cluster"
|
||||
- "--initial-cluster-state=new"
|
||||
- "--initial-cluster-token=etcd-cluster-1"
|
||||
- "--initial-cluster=etcd-cluster=http://etcd:2380"
|
||||
- "--initial-advertise-peer-urls=http://etcd:2380"
|
||||
- "--advertise-client-urls=http://etcd:2379"
|
||||
- "--listen-client-urls=http://0.0.0.0:2379"
|
||||
- "--listen-peer-urls=http://0.0.0.0:2380"
|
||||
- "--quota-backend-bytes=134217728" # 128 MB
|
||||
|
||||
minio:
|
||||
image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
|
||||
ports:
|
||||
- 9000:9000
|
||||
- 9001:9001
|
||||
environment:
|
||||
- MINIO_ROOT_USER=minio
|
||||
- MINIO_ROOT_PASSWORD=password
|
||||
command: server /data --address :9000 --console-address ":9001"
|
||||
|
||||
minio_create_buckets:
|
||||
image: minio/mc
|
||||
environment:
|
||||
- MINIO_ROOT_USER=minio
|
||||
- MINIO_ROOT_PASSWORD=password
|
||||
entrypoint:
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
command:
|
||||
- "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do
|
||||
echo 'Waiting to start minio...' && sleep 1;
|
||||
done;
|
||||
/usr/bin/mc mb minio/neon --region=eu-north-1;
|
||||
exit 0;"
|
||||
depends_on:
|
||||
- minio
|
||||
|
||||
pageserver:
|
||||
image: neondatabase/neon:${TAG:-latest}
|
||||
environment:
|
||||
- BROKER_ENDPOINT='http://etcd:2379'
|
||||
- AWS_ACCESS_KEY_ID=minio
|
||||
- AWS_SECRET_ACCESS_KEY=password
|
||||
#- RUST_BACKTRACE=1
|
||||
ports:
|
||||
#- 6400:6400 # pg protocol handler
|
||||
- 9898:9898 # http endpoints
|
||||
entrypoint:
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
command:
|
||||
- "/usr/local/bin/pageserver -D /data/.neon/
|
||||
-c \"broker_endpoints=[$$BROKER_ENDPOINT]\"
|
||||
-c \"listen_pg_addr='0.0.0.0:6400'\"
|
||||
-c \"listen_http_addr='0.0.0.0:9898'\"
|
||||
-c \"remote_storage={endpoint='http://minio:9000',
|
||||
bucket_name='neon',
|
||||
bucket_region='eu-north-1',
|
||||
prefix_in_bucket='/pageserver/'}\""
|
||||
depends_on:
|
||||
- etcd
|
||||
- minio_create_buckets
|
||||
|
||||
safekeeper1:
|
||||
image: neondatabase/neon:${TAG:-latest}
|
||||
environment:
|
||||
- SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
|
||||
- SAFEKEEPER_ID=1
|
||||
- BROKER_ENDPOINT=http://etcd:2379
|
||||
- AWS_ACCESS_KEY_ID=minio
|
||||
- AWS_SECRET_ACCESS_KEY=password
|
||||
#- RUST_BACKTRACE=1
|
||||
ports:
|
||||
#- 5454:5454 # pg protocol handler
|
||||
- 7676:7676 # http endpoints
|
||||
entrypoint:
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
command:
|
||||
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
|
||||
--listen-http='0.0.0.0:7676'
|
||||
--id=$$SAFEKEEPER_ID
|
||||
--broker-endpoints=$$BROKER_ENDPOINT
|
||||
-D /data
|
||||
--remote-storage=\"{endpoint='http://minio:9000',
|
||||
bucket_name='neon',
|
||||
bucket_region='eu-north-1',
|
||||
prefix_in_bucket='/safekeeper/'}\""
|
||||
depends_on:
|
||||
- etcd
|
||||
- minio_create_buckets
|
||||
|
||||
safekeeper2:
|
||||
image: neondatabase/neon:${TAG:-latest}
|
||||
environment:
|
||||
- SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
|
||||
- SAFEKEEPER_ID=2
|
||||
- BROKER_ENDPOINT=http://etcd:2379
|
||||
- AWS_ACCESS_KEY_ID=minio
|
||||
- AWS_SECRET_ACCESS_KEY=password
|
||||
#- RUST_BACKTRACE=1
|
||||
ports:
|
||||
#- 5454:5454 # pg protocol handler
|
||||
- 7677:7676 # http endpoints
|
||||
entrypoint:
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
command:
|
||||
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
|
||||
--listen-http='0.0.0.0:7676'
|
||||
--id=$$SAFEKEEPER_ID
|
||||
--broker-endpoints=$$BROKER_ENDPOINT
|
||||
-D /data
|
||||
--remote-storage=\"{endpoint='http://minio:9000',
|
||||
bucket_name='neon',
|
||||
bucket_region='eu-north-1',
|
||||
prefix_in_bucket='/safekeeper/'}\""
|
||||
depends_on:
|
||||
- etcd
|
||||
- minio_create_buckets
|
||||
|
||||
safekeeper3:
|
||||
image: neondatabase/neon:${TAG:-latest}
|
||||
environment:
|
||||
- SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
|
||||
- SAFEKEEPER_ID=3
|
||||
- BROKER_ENDPOINT=http://etcd:2379
|
||||
- AWS_ACCESS_KEY_ID=minio
|
||||
- AWS_SECRET_ACCESS_KEY=password
|
||||
#- RUST_BACKTRACE=1
|
||||
ports:
|
||||
#- 5454:5454 # pg protocol handler
|
||||
- 7678:7676 # http endpoints
|
||||
entrypoint:
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
command:
|
||||
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
|
||||
--listen-http='0.0.0.0:7676'
|
||||
--id=$$SAFEKEEPER_ID
|
||||
--broker-endpoints=$$BROKER_ENDPOINT
|
||||
-D /data
|
||||
--remote-storage=\"{endpoint='http://minio:9000',
|
||||
bucket_name='neon',
|
||||
bucket_region='eu-north-1',
|
||||
prefix_in_bucket='/safekeeper/'}\""
|
||||
depends_on:
|
||||
- etcd
|
||||
- minio_create_buckets
|
||||
|
||||
compute:
|
||||
build:
|
||||
context: ./image/compute
|
||||
args:
|
||||
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest}
|
||||
- http_proxy=$http_proxy
|
||||
- https_proxy=$https_proxy
|
||||
environment:
|
||||
- PG_VERSION=${PG_VERSION:-14}
|
||||
#- RUST_BACKTRACE=1
|
||||
volumes:
|
||||
- ./compute/var/db/postgres/specs/:/var/db/postgres/specs/
|
||||
- ./compute/shell/:/shell/
|
||||
ports:
|
||||
- 55433:55433 # pg protocol handler
|
||||
- 3080:3080 # http endpoints
|
||||
entrypoint:
|
||||
- "/shell/compute.sh"
|
||||
depends_on:
|
||||
- safekeeper1
|
||||
- safekeeper2
|
||||
- safekeeper3
|
||||
- pageserver
|
||||
|
||||
compute_is_ready:
|
||||
image: postgres:latest
|
||||
entrypoint:
|
||||
- "/bin/bash"
|
||||
- "-c"
|
||||
command:
|
||||
- "until pg_isready -h compute -p 55433 ; do
|
||||
echo 'Waiting to start compute...' && sleep 1;
|
||||
done"
|
||||
depends_on:
|
||||
- compute
|
||||
10
docker-compose/image/compute/Dockerfile
Normal file
10
docker-compose/image/compute/Dockerfile
Normal file
@@ -0,0 +1,10 @@
|
||||
ARG COMPUTE_IMAGE=compute-node-v14:latest
|
||||
FROM neondatabase/${COMPUTE_IMAGE}
|
||||
|
||||
USER root
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl \
|
||||
jq \
|
||||
netcat
|
||||
|
||||
USER postgres
|
||||
@@ -18,3 +18,67 @@ We build all images after a successful `release` tests run and push automaticall
|
||||
1. `neondatabase/compute-tools` and `neondatabase/compute-node`
|
||||
|
||||
2. `neondatabase/neon`
|
||||
|
||||
## Docker Compose example
|
||||
|
||||
You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following containers.
|
||||
|
||||
- etcd x 1
|
||||
- pageserver x 1
|
||||
- safekeeper x 3
|
||||
- compute x 1
|
||||
- MinIO x 1 # This is Amazon S3 compatible object storage
|
||||
|
||||
### How to use
|
||||
|
||||
1. create containers
|
||||
|
||||
You can specify the version of the neon cluster using the following environment variables.
|
||||
- PG_VERSION: postgres version for compute (default is 14)
|
||||
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
|
||||
```
|
||||
$ cd docker-compose/
|
||||
$ docker-compose down # remove the containers if they exist
|
||||
$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version
|
||||
Creating network "dockercompose_default" with the default driver
|
||||
Creating dockercompose_etcd3_1 ...
|
||||
(...omit...)
|
||||
```
|
||||
|
||||
2. connect compute node
|
||||
```
|
||||
$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
|
||||
$ psql -h localhost -p 55433 -U cloud_admin
|
||||
postgres=# CREATE TABLE t(key int primary key, value text);
|
||||
CREATE TABLE
|
||||
postgres=# insert into t values(1,1);
|
||||
INSERT 0 1
|
||||
postgres=# select * from t;
|
||||
key | value
|
||||
-----+-------
|
||||
1 | 1
|
||||
(1 row)
|
||||
```
|
||||
|
||||
3. If you want to see the log, you can use `docker-compose logs` command.
|
||||
```
|
||||
# check the container name you want to see
|
||||
$ docker ps
|
||||
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
|
||||
d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1
|
||||
(...omit...)
|
||||
|
||||
$ docker logs -f dockercompose_compute_1
|
||||
2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql
|
||||
2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400'
|
||||
(...omit...)
|
||||
```
|
||||
|
||||
4. If you want to see durable data in MinIO which is s3 compatible storage
|
||||
|
||||
Access http://localhost:9001 and sign in.
|
||||
|
||||
- Username: `minio`
|
||||
- Password: `password`
|
||||
|
||||
You can see durable pages and WAL data in `neon` bucket.
|
||||
246
docs/rfcs/020-pageserver-s3-coordination.md
Normal file
246
docs/rfcs/020-pageserver-s3-coordination.md
Normal file
@@ -0,0 +1,246 @@
|
||||
# Coordinating access of multiple pageservers to the same s3 data
|
||||
|
||||
## Motivation
|
||||
|
||||
There are some blind spots around coordinating access of multiple pageservers
|
||||
to the same s3 data. Currently this is applicable only to tenant relocation
|
||||
case, but in the future we'll need to solve similar problems for
|
||||
replica/standby pageservers.
|
||||
|
||||
## Impacted components (e.g. pageserver, safekeeper, console, etc)
|
||||
|
||||
Pageserver
|
||||
|
||||
## The problem
|
||||
|
||||
### Relocation
|
||||
|
||||
During relocation both pageservers can write to s3. This should be ok for all
|
||||
data except the `index_part.json`. For index part it causes problems during
|
||||
compaction/gc because they remove files from index/s3.
|
||||
|
||||
Imagine this case:
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant PS1
|
||||
participant S3
|
||||
participant PS2
|
||||
|
||||
PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
|
||||
PS2->>S3: Attach called, sees L1, L2
|
||||
PS1->>S3: Compaction comes <br/> Removes L1, adds L3
|
||||
note over S3: Index now L2, L3
|
||||
PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
|
||||
note over S3: Index now L1, L2, L4
|
||||
```
|
||||
|
||||
At this point it is not possible to restore from index, it contains L1 which
|
||||
is no longer available in s3 and doesn't contain L3 added by compaction by the
|
||||
first pageserver. So if any of the pageservers restarts, initial sync will fail
|
||||
(or in on-demand world it will fail a bit later during page request from
|
||||
missing layer)
|
||||
|
||||
### Standby pageserver
|
||||
|
||||
Another related case is standby pageserver. In this case second pageserver can
|
||||
be used as a replica to scale reads and serve as a failover target in case
|
||||
first one fails.
|
||||
|
||||
In this mode second pageserver needs to have the same picture of s3 files to
|
||||
be able to load layers on-demand. To accomplish that second pageserver
|
||||
cannot run gc/compaction jobs. Instead it needs to receive updates for index
|
||||
contents. (There is no need to run walreceiver on the second pageserver then).
|
||||
|
||||
## Observations
|
||||
|
||||
- If both pageservers ingest wal then their layer set diverges, because layer
|
||||
file generation is not deterministic
|
||||
- If one of the pageservers does not ingest wal (and just picks up layer
|
||||
updates) then it lags behind and cannot really answer queries in the same
|
||||
pace as the primary one
|
||||
- Can compaction help make layers deterministic? E g we do not upload level
|
||||
zero layers and construction of higher levels should be deterministic.
|
||||
This way we can guarantee that layer creation by timeout won't mess things up.
|
||||
This way one pageserver uploads data and second one can just ingest it.
|
||||
But we still need some form of election
|
||||
|
||||
## Solutions
|
||||
|
||||
### Manual orchestration
|
||||
|
||||
One possible solution for relocation case is to orchestrate background jobs
|
||||
from outside. The oracle who runs migration can turn off background jobs on
|
||||
PS1 before migration and then run migration -> enable them on PS2. The problem
|
||||
comes if migration fails. In this case in order to resume background jobs
|
||||
oracle needs to guarantee that PS2 doesn't run background jobs and if it doesn't
|
||||
respond then PS1 is stuck unable to run compaction/gc. This cannot be solved
|
||||
without human ensuring that no upload from PS2 can happen. In order to be able
|
||||
to resolve this automatically CAS is required on S3 side so pageserver can
|
||||
avoid overwriting index part if it is no longer the leading one
|
||||
|
||||
Note that flag that disables background jobs needs to be persistent, because
|
||||
otherwise pageserver restart will clean it
|
||||
|
||||
### Avoid index_part.json
|
||||
|
||||
Index part consists of two parts, list of layers and metadata. List of layers
|
||||
can be easily obtained by `ListObjects` S3 API method. But what to do with
|
||||
metadata? Create metadata instance for each checkpoint and add some counter
|
||||
to the file name?
|
||||
|
||||
Back to potentially long s3 ls.
|
||||
|
||||
### Coordination based approach
|
||||
|
||||
Do it like safekeepers choose a leader for WAL upload. Ping each other and decide
|
||||
based on some heuristics e g smallest node id. During relocation PS1 sends
|
||||
"resign" ping message so others can start election without waiting for a timeout.
|
||||
|
||||
This still leaves metadata question open and non deterministic layers are a
|
||||
problem as well
|
||||
|
||||
### Avoid metadata file
|
||||
|
||||
One way to eliminate metadata file is to store it in layer files under some
|
||||
special key. This may resonate with intention to keep all relation sizes in
|
||||
some special segment to avoid initial download during size calculation.
|
||||
Maybe with that we can even store pre calculated value.
|
||||
|
||||
As a downside each checkpoint gets 512 bytes larger.
|
||||
|
||||
If we entirely avoid metadata file this opens up many approaches
|
||||
|
||||
* * *
|
||||
|
||||
During discussion it seems that we converged on the approach consisting of:
|
||||
|
||||
- index files stored per pageserver in the same timeline directory. With that
|
||||
index file name starts to look like: `<pageserver_node_id>_index_part.json`.
|
||||
In such set up there are no concurrent overwrites of index file by different
|
||||
pageservers.
|
||||
- For replica pageservers the solution would be for primary to broadcast index
|
||||
changes to any followers with an ability to check index files in s3 and
|
||||
restore the full state. To properly merge changes with index files we can use
|
||||
a counter that is persisted in an index file, is incremented on every change
|
||||
to it and passed along with broadcasted change. This way we can determine
|
||||
whether we need to apply change to the index state or not.
|
||||
- Responsibility for running background jobs is assigned externally. Pageserver
|
||||
keeps locally persistent flag for each tenant that indicates whether this
|
||||
pageserver is considered as primary one or not. TODO what happens if we
|
||||
crash and cannot start for some extended period of time? Control plane can
|
||||
assign ownership to some other pageserver. Pageserver needs some way to check
|
||||
if it's still the blessed one. Maybe by explicit request to control plane on
|
||||
start.
|
||||
|
||||
Requirement for deterministic layer generation was considered overly strict
|
||||
because of two reasons:
|
||||
|
||||
- It can limit possible optimizations e g when pageserver wants to reshuffle
|
||||
some data locally and doesn't want to coordinate this
|
||||
- The deterministic algorithm itself can change so during deployments for some
|
||||
time there will be two different versions running at the same time which can
|
||||
cause non determinism
|
||||
|
||||
### External elections
|
||||
|
||||
The above case with lost state in this schema with externally managed
|
||||
leadership is represented like this:
|
||||
|
||||
Note that here we keep objects list in the index file.
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant PS1
|
||||
participant CP as Control Plane
|
||||
participant S3
|
||||
participant PS2
|
||||
|
||||
note over PS1,PS2: PS1 starts up and still a leader
|
||||
PS1->>CP: Am I still the leader for Tenant X?
|
||||
activate CP
|
||||
CP->>PS1: Yes
|
||||
deactivate CP
|
||||
PS1->>S3: Fetch PS1 index.
|
||||
note over PS1: Continue operations, start background jobs
|
||||
note over PS1,PS2: PS1 starts up and is not a leader anymore
|
||||
PS1->>CP: Am I still the leader for Tenant X?
|
||||
CP->>PS1: No
|
||||
PS1->>PS2: Subscribe to index changes
|
||||
PS1->>S3: Fetch PS1 and PS2 indexes
|
||||
note over PS1: Combine index file to include layers <br> from both indexes to be able <br> to see newer files from leader (PS2)
|
||||
note over PS1: Continue operations, do not start background jobs
|
||||
```
|
||||
|
||||
### Internal elections
|
||||
|
||||
To manage leadership internally we can use broker to exchange pings so nodes
|
||||
can decide on the leader roles. In case multiple pageservers are active leader
|
||||
is the one with lowest node id.
|
||||
|
||||
Operations with internally managed elections:
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant PS1
|
||||
participant S3
|
||||
|
||||
note over PS1: Starts up
|
||||
note over PS1: Subscribes to changes, waits for two ping <br> timeouts to see if there is a leader
|
||||
PS1->>S3: Fetch indexes from s3
|
||||
alt there is a leader
|
||||
note over PS1: do not start background jobs, <br> continue applying index updates
|
||||
else there is no leader
|
||||
note over PS1: start background jobs, <br> broadcast index changes
|
||||
end
|
||||
|
||||
note over PS1,S3: Then the picture is similar to external elections <br> the difference is that follower can become a leader <br> if there are no pings after some timeout new leader gets elected
|
||||
```
|
||||
|
||||
### Eviction
|
||||
|
||||
When two pageservers operate on a tenant for extended period of time follower
|
||||
doesn't perform write operations in s3. When layer is evicted follower relies
|
||||
on updates from primary to get info about layers it needs to cover range for
|
||||
evicted layer.
|
||||
|
||||
Note that it won't match evicted layer exactly, so layers will overlap and
|
||||
lookup code needs to correctly handle that.
|
||||
|
||||
### Relocation flow
|
||||
|
||||
Actions become:
|
||||
|
||||
- Attach tenant to new pageserver
|
||||
- New pageserver becomes follower since previous one is still leading
|
||||
- New pageserver starts replicating from safekeepers but does not upload layers
|
||||
- Detach is called on the old one
|
||||
- New pageserver becomes leader after it realizes that old one disappeared
|
||||
|
||||
### Index File
|
||||
|
||||
Using `s3 ls` on startup simplifies things, but we still need metadata, so we
|
||||
need to fetch index files anyway. If they contain list of files we can combine
|
||||
them and avoid costly `s3 ls`
|
||||
|
||||
### Remaining issues
|
||||
|
||||
- More than one remote consistent lsn for safekeepers to know
|
||||
|
||||
Anything else?
|
||||
|
||||
### Proposed solution
|
||||
|
||||
To recap. On meeting we converged on approach with external elections but I
|
||||
think it will be overall harder to manage and will introduce a dependency on
|
||||
control plane for pageserver. Using separate index files for each pageserver
|
||||
consisting of log of operations and a metadata snapshot should be enough.
|
||||
|
||||
### What we need to get there?
|
||||
|
||||
- Change index file structure to contain log of changes instead of just the
|
||||
file list
|
||||
- Implement pinging/elections for pageservers
|
||||
@@ -52,6 +52,10 @@ PostgreSQL extension that implements storage manager API and network communicati
|
||||
|
||||
PostgreSQL extension that contains functions needed for testing and debugging.
|
||||
|
||||
`/pgxn/neon_walredo`:
|
||||
|
||||
Library to run Postgres as a "WAL redo process" in the pageserver.
|
||||
|
||||
`/safekeeper`:
|
||||
|
||||
The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
|
||||
|
||||
@@ -29,6 +29,9 @@ pub struct SkTimelineInfo {
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub peer_horizon_lsn: Option<Lsn>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub local_start_lsn: Option<Lsn>,
|
||||
/// A connection string to use for WAL receiving.
|
||||
#[serde(default)]
|
||||
pub safekeeper_connstr: Option<String>,
|
||||
|
||||
@@ -7,6 +7,9 @@ edition = "2021"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "2.0"
|
||||
const_format = "0.2.21"
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
bytes = "1.0.1"
|
||||
|
||||
utils = { path = "../utils" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -2,6 +2,7 @@ use const_format::formatcp;
|
||||
|
||||
/// Public API types
|
||||
pub mod models;
|
||||
pub mod reltag;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
|
||||
@@ -7,6 +7,10 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::reltag::RelTag;
|
||||
use anyhow::bail;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
|
||||
/// A state of a tenant in pageserver's memory.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum TenantState {
|
||||
@@ -19,6 +23,22 @@ pub enum TenantState {
|
||||
Broken,
|
||||
}
|
||||
|
||||
/// A state of a timeline in pageserver's memory.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum TimelineState {
|
||||
/// Timeline is fully operational, its background jobs are running.
|
||||
Active,
|
||||
/// A timeline is recognized by pageserver, but not yet ready to operate.
|
||||
/// The status indicates, that the timeline could eventually go back to Active automatically:
|
||||
/// for example, if the owning tenant goes back to Active again.
|
||||
Suspended,
|
||||
/// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to
|
||||
/// automatically become Active after certain events: only a management call can change this status.
|
||||
Paused,
|
||||
/// A timeline is recognized by the pageserver, but no longer used for any operations, as failed to get activated.
|
||||
Broken,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TimelineCreateRequest {
|
||||
@@ -160,6 +180,8 @@ pub struct TimelineInfo {
|
||||
pub remote_consistent_lsn: Option<Lsn>,
|
||||
pub awaits_download: bool,
|
||||
|
||||
pub state: TimelineState,
|
||||
|
||||
// Some of the above fields are duplicated in 'local' and 'remote', for backwards-
|
||||
// compatibility with older clients.
|
||||
pub local: LocalTimelineInfo,
|
||||
@@ -201,3 +223,160 @@ pub struct FailpointConfig {
|
||||
pub struct TimelineGcRequest {
|
||||
pub gc_horizon: Option<u64>,
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
pub enum PagestreamFeMessage {
|
||||
Exists(PagestreamExistsRequest),
|
||||
Nblocks(PagestreamNblocksRequest),
|
||||
GetPage(PagestreamGetPageRequest),
|
||||
DbSize(PagestreamDbSizeRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
pub enum PagestreamBeMessage {
|
||||
Exists(PagestreamExistsResponse),
|
||||
Nblocks(PagestreamNblocksResponse),
|
||||
GetPage(PagestreamGetPageResponse),
|
||||
Error(PagestreamErrorResponse),
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamExistsRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamNblocksRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamGetPageRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
pub rel: RelTag,
|
||||
pub blkno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamDbSizeRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
pub dbnode: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamExistsResponse {
|
||||
pub exists: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamNblocksResponse {
|
||||
pub n_blocks: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamGetPageResponse {
|
||||
pub page: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamErrorResponse {
|
||||
pub message: String,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamDbSizeResponse {
|
||||
pub db_size: i64,
|
||||
}
|
||||
|
||||
impl PagestreamFeMessage {
|
||||
pub fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
|
||||
// TODO these gets can fail
|
||||
|
||||
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
||||
//
|
||||
// TODO: consider using protobuf or serde bincode for less error prone
|
||||
// serialization.
|
||||
let msg_tag = body.get_u8();
|
||||
match msg_tag {
|
||||
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
rel: RelTag {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
},
|
||||
})),
|
||||
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
rel: RelTag {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
},
|
||||
})),
|
||||
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
rel: RelTag {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
},
|
||||
blkno: body.get_u32(),
|
||||
})),
|
||||
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
dbnode: body.get_u32(),
|
||||
})),
|
||||
_ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PagestreamBeMessage {
|
||||
pub fn serialize(&self) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
match self {
|
||||
Self::Exists(resp) => {
|
||||
bytes.put_u8(100); /* tag from pagestore_client.h */
|
||||
bytes.put_u8(resp.exists as u8);
|
||||
}
|
||||
|
||||
Self::Nblocks(resp) => {
|
||||
bytes.put_u8(101); /* tag from pagestore_client.h */
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
|
||||
Self::GetPage(resp) => {
|
||||
bytes.put_u8(102); /* tag from pagestore_client.h */
|
||||
bytes.put(&resp.page[..]);
|
||||
}
|
||||
|
||||
Self::Error(resp) => {
|
||||
bytes.put_u8(103); /* tag from pagestore_client.h */
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
Self::DbSize(resp) => {
|
||||
bytes.put_u8(104); /* tag from pagestore_client.h */
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,7 +19,7 @@ thiserror = "1.0"
|
||||
tokio = { version = "1.17", features = ["macros"]}
|
||||
tokio-rustls = "0.23"
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
|
||||
nix = "0.25"
|
||||
signal-hook = "0.3.10"
|
||||
rand = "0.8.3"
|
||||
@@ -30,6 +30,8 @@ rustls-split = "0.3.0"
|
||||
git-version = "0.3.5"
|
||||
serde_with = "2.0"
|
||||
once_cell = "1.13.0"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
|
||||
|
||||
metrics = { path = "../metrics" }
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
use postgres::Config;
|
||||
|
||||
pub fn connection_host_port(config: &Config) -> (String, u16) {
|
||||
assert_eq!(
|
||||
config.get_hosts().len(),
|
||||
1,
|
||||
"only one pair of host and port is supported in connection string"
|
||||
);
|
||||
assert_eq!(
|
||||
config.get_ports().len(),
|
||||
1,
|
||||
"only one pair of host and port is supported in connection string"
|
||||
);
|
||||
let host = match &config.get_hosts()[0] {
|
||||
postgres::config::Host::Tcp(host) => host.as_ref(),
|
||||
postgres::config::Host::Unix(host) => host.to_str().unwrap(),
|
||||
};
|
||||
(host.to_owned(), config.get_ports()[0])
|
||||
}
|
||||
|
||||
pub fn connection_address(config: &Config) -> String {
|
||||
let (host, port) = connection_host_port(config);
|
||||
format!("{}:{}", host, port)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A connection string with one host and one port yields that pair.
    #[test]
    fn test_connection_host_port() {
        let parsed = "postgresql://no_user@localhost:64000/no_db"
            .parse::<Config>()
            .unwrap();
        let expected = ("localhost".to_owned(), 64000);
        assert_eq!(connection_host_port(&parsed), expected);
    }

    /// More than one host/port pair is rejected with a panic.
    #[test]
    #[should_panic(expected = "only one pair of host and port is supported in connection string")]
    fn test_connection_host_port_multiple_ports() {
        let parsed = "postgresql://no_user@localhost:64000,localhost:64001/no_db"
            .parse::<Config>()
            .unwrap();
        let expected = ("localhost".to_owned(), 64000);
        assert_eq!(connection_host_port(&parsed), expected);
    }
}
|
||||
@@ -75,6 +75,12 @@ impl From<[u8; 16]> for Id {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Id> for u128 {
    /// Interprets the id's bytes as a little-endian integer.
    fn from(id: Id) -> Self {
        Self::from_le_bytes(id.0)
    }
}
|
||||
|
||||
impl fmt::Display for Id {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.write_str(&self.hex_encode())
|
||||
@@ -136,6 +142,12 @@ macro_rules! id_newtype {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<$t> for u128 {
|
||||
fn from(id: $t) -> Self {
|
||||
u128::from(id.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for $t {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
self.0.fmt(f)
|
||||
|
||||
@@ -19,9 +19,6 @@ pub mod postgres_backend;
|
||||
pub mod postgres_backend_async;
|
||||
pub mod pq_proto;
|
||||
|
||||
// dealing with connstring parsing and handy access to its parts
|
||||
pub mod connstring;
|
||||
|
||||
// helper functions for creating and fsyncing
|
||||
pub mod crashsafe;
|
||||
|
||||
@@ -39,6 +36,8 @@ pub mod sock_split;
|
||||
// common log initialisation routine
|
||||
pub mod logging;
|
||||
|
||||
pub mod lock_file;
|
||||
|
||||
// Misc
|
||||
pub mod accum;
|
||||
pub mod shutdown;
|
||||
|
||||
81
libs/utils/src/lock_file.rs
Normal file
81
libs/utils/src/lock_file.rs
Normal file
@@ -0,0 +1,81 @@
|
||||
//! A module to create and read lock files. A lock file ensures that only one
|
||||
//! process is running at a time, in a particular directory.
|
||||
//!
|
||||
//! File locking is done using [`fcntl::flock`], which means that holding the
|
||||
//! lock on file only prevents acquiring another lock on it; all other
|
||||
//! operations are still possible on files. Other process can still open, read,
|
||||
//! write, or remove the file, for example.
|
||||
//! If the file is removed while a process is holding a lock on it,
|
||||
//! the process that holds the lock does not get any error or notification.
|
||||
//! Furthermore, you can create a new file with the same name and lock the new file,
|
||||
//! while the old process is still running.
|
||||
//! Deleting the lock file while the locking process is still running is a bad idea!
|
||||
|
||||
use std::{fs, os::unix::prelude::AsRawFd, path::Path};
|
||||
|
||||
use anyhow::Context;
|
||||
use nix::fcntl;
|
||||
|
||||
use crate::crashsafe;
|
||||
|
||||
pub enum LockCreationResult {
|
||||
Created {
|
||||
new_lock_contents: String,
|
||||
file: fs::File,
|
||||
},
|
||||
AlreadyLocked {
|
||||
existing_lock_contents: String,
|
||||
},
|
||||
CreationFailed(anyhow::Error),
|
||||
}
|
||||
|
||||
/// Creates a lock file in the path given and writes the given contents into the file.
|
||||
/// Note: The lock is automatically released when the file closed. You might want to use Box::leak to make sure it lives until the end of the program.
|
||||
pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
|
||||
let lock_file = match fs::OpenOptions::new()
|
||||
.create(true) // O_CREAT
|
||||
.write(true)
|
||||
.open(lock_file_path)
|
||||
.context("Failed to open lock file")
|
||||
{
|
||||
Ok(file) => file,
|
||||
Err(e) => return LockCreationResult::CreationFailed(e),
|
||||
};
|
||||
|
||||
match fcntl::flock(
|
||||
lock_file.as_raw_fd(),
|
||||
fcntl::FlockArg::LockExclusiveNonblock,
|
||||
) {
|
||||
Ok(()) => {
|
||||
match lock_file
|
||||
.set_len(0)
|
||||
.context("Failed to truncate lockfile")
|
||||
.and_then(|()| {
|
||||
fs::write(lock_file_path, &contents).with_context(|| {
|
||||
format!("Failed to write '{contents}' contents into lockfile")
|
||||
})
|
||||
})
|
||||
.and_then(|()| {
|
||||
crashsafe::fsync_file_and_parent(lock_file_path)
|
||||
.context("Failed to fsync lockfile")
|
||||
}) {
|
||||
Ok(()) => LockCreationResult::Created {
|
||||
new_lock_contents: contents,
|
||||
file: lock_file,
|
||||
},
|
||||
Err(e) => LockCreationResult::CreationFailed(e),
|
||||
}
|
||||
}
|
||||
Err(nix::errno::Errno::EAGAIN) => {
|
||||
match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
|
||||
Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
|
||||
existing_lock_contents,
|
||||
},
|
||||
Err(e) => LockCreationResult::CreationFailed(e),
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,19 +1,28 @@
|
||||
use std::{
|
||||
fs::{File, OpenOptions},
|
||||
path::Path,
|
||||
};
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::Context;
|
||||
use strum_macros::{EnumString, EnumVariantNames};
|
||||
|
||||
pub fn init(log_filename: impl AsRef<Path>, daemonize: bool) -> Result<File> {
|
||||
// Don't open the same file for output multiple times;
|
||||
// the different fds could overwrite each other's output.
|
||||
let log_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?;
|
||||
#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub enum LogFormat {
|
||||
Plain,
|
||||
Json,
|
||||
}
|
||||
|
||||
impl LogFormat {
|
||||
pub fn from_config(s: &str) -> anyhow::Result<LogFormat> {
|
||||
use strum::VariantNames;
|
||||
LogFormat::from_str(s).with_context(|| {
|
||||
format!(
|
||||
"Unrecognized log format. Please specify one of: {:?}",
|
||||
LogFormat::VARIANTS
|
||||
)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
|
||||
let default_filter_str = "info";
|
||||
|
||||
// We fall back to printing all spans at info-level or above if
|
||||
@@ -23,20 +32,14 @@ pub fn init(log_filename: impl AsRef<Path>, daemonize: bool) -> Result<File> {
|
||||
|
||||
let base_logger = tracing_subscriber::fmt()
|
||||
.with_env_filter(env_filter)
|
||||
.with_target(false) // don't include event targets
|
||||
.with_ansi(false); // don't use colors in log file;
|
||||
.with_target(false)
|
||||
.with_ansi(false)
|
||||
.with_writer(std::io::stdout);
|
||||
|
||||
// we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it
|
||||
// if we do not use daemonization (e.g. in docker) it is better to log to stdout directly
|
||||
// for example to be in line with docker log command which expects logs coming from stdout
|
||||
if daemonize {
|
||||
let x = log_file.try_clone().unwrap();
|
||||
base_logger
|
||||
.with_writer(move || x.try_clone().unwrap())
|
||||
.init();
|
||||
} else {
|
||||
base_logger.init();
|
||||
match log_format {
|
||||
LogFormat::Json => base_logger.json().init(),
|
||||
LogFormat::Plain => base_logger.init(),
|
||||
}
|
||||
|
||||
Ok(log_file)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -24,7 +24,6 @@ hex = "0.4.3"
|
||||
hyper = "0.14"
|
||||
itertools = "0.10.3"
|
||||
clap = { version = "4.0", features = ["string"] }
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
@@ -67,6 +66,7 @@ remote_storage = { path = "../libs/remote_storage" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
close_fds = "0.3.2"
|
||||
walkdir = "2.3.2"
|
||||
svg_fmt = "0.4.1"
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = "0.4"
|
||||
|
||||
@@ -22,8 +22,8 @@ use std::time::SystemTime;
|
||||
use tar::{Builder, EntryType, Header};
|
||||
use tracing::*;
|
||||
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::tenant::Timeline;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
|
||||
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
||||
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
|
||||
|
||||
150
pageserver/src/bin/draw_timeline_dir.rs
Normal file
150
pageserver/src/bin/draw_timeline_dir.rs
Normal file
@@ -0,0 +1,150 @@
|
||||
//! A tool for visualizing the arrangement of layerfiles within a timeline.
|
||||
//!
|
||||
//! It reads filenames from stdin and prints a svg on stdout. The image is a plot in
|
||||
//! page-lsn space, where every delta layer is a rectangle and every image layer is a
|
||||
//! thick line. Legend:
|
||||
//! - The x axis (left to right) represents page index.
|
||||
//! - The y axis represents LSN, growing upwards.
|
||||
//!
|
||||
//! Coordinates in both axes are compressed for better readability.
|
||||
//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
|
||||
//!
|
||||
//! Example use:
|
||||
//! ```
|
||||
//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE
|
||||
//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
|
||||
//! $ firefox out.svg
|
||||
//! ```
|
||||
//!
|
||||
//! This API was chosen so that we can easily work with filenames extracted from ssh,
|
||||
//! or from pageserver log files.
|
||||
//!
|
||||
//! TODO Consider shipping this as a grafana panel plugin:
|
||||
//! https://grafana.com/tutorials/build-a-panel-plugin/
|
||||
use anyhow::Result;
|
||||
use pageserver::repository::Key;
|
||||
use std::cmp::Ordering;
|
||||
use std::io::{self, BufRead};
|
||||
use std::{
|
||||
collections::{BTreeMap, BTreeSet},
|
||||
ops::Range,
|
||||
};
|
||||
use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke};
|
||||
use utils::{lsn::Lsn, project_git_version};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
// Map values to their compressed coordinate - the index the value
// would have in a sorted and deduplicated list of all values.
fn build_coordinate_compression_map<T: Ord + Copy>(coords: Vec<T>) -> BTreeMap<T, usize> {
    // A BTreeSet both deduplicates and yields its elements in ascending order.
    let sorted_unique: BTreeSet<T> = coords.into_iter().collect();

    sorted_unique
        .into_iter()
        .enumerate()
        .map(|(rank, value)| (value, rank))
        .collect()
}
|
||||
|
||||
fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
|
||||
let split: Vec<&str> = name.split("__").collect();
|
||||
let keys: Vec<&str> = split[0].split('-').collect();
|
||||
let mut lsns: Vec<&str> = split[1].split('-').collect();
|
||||
if lsns.len() == 1 {
|
||||
lsns.push(lsns[0]);
|
||||
}
|
||||
|
||||
let keys = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
|
||||
let lsns = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap();
|
||||
(keys, lsns)
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// Parse layer filenames from stdin
|
||||
let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
|
||||
let stdin = io::stdin();
|
||||
for line in stdin.lock().lines() {
|
||||
let range = parse_filename(&line.unwrap());
|
||||
ranges.push(range);
|
||||
}
|
||||
|
||||
// Collect all coordinates
|
||||
let mut keys: Vec<Key> = vec![];
|
||||
let mut lsns: Vec<Lsn> = vec![];
|
||||
for (keyr, lsnr) in &ranges {
|
||||
keys.push(keyr.start);
|
||||
keys.push(keyr.end);
|
||||
lsns.push(lsnr.start);
|
||||
lsns.push(lsnr.end);
|
||||
}
|
||||
|
||||
// Analyze
|
||||
let key_map = build_coordinate_compression_map(keys);
|
||||
let lsn_map = build_coordinate_compression_map(lsns);
|
||||
|
||||
// Initialize stats
|
||||
let mut num_deltas = 0;
|
||||
let mut num_images = 0;
|
||||
|
||||
// Draw
|
||||
let stretch = 3.0; // Stretch out vertically for better visibility
|
||||
println!(
|
||||
"{}",
|
||||
BeginSvg {
|
||||
w: key_map.len() as f32,
|
||||
h: stretch * lsn_map.len() as f32
|
||||
}
|
||||
);
|
||||
for (keyr, lsnr) in &ranges {
|
||||
let key_start = *key_map.get(&keyr.start).unwrap();
|
||||
let key_end = *key_map.get(&keyr.end).unwrap();
|
||||
let key_diff = key_end - key_start;
|
||||
let lsn_max = lsn_map.len();
|
||||
|
||||
if key_start >= key_end {
|
||||
panic!("Invalid key range {}-{}", key_start, key_end);
|
||||
}
|
||||
|
||||
let lsn_start = *lsn_map.get(&lsnr.start).unwrap();
|
||||
let lsn_end = *lsn_map.get(&lsnr.end).unwrap();
|
||||
|
||||
let mut lsn_diff = (lsn_end - lsn_start) as f32;
|
||||
let mut fill = Fill::None;
|
||||
let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
|
||||
let mut lsn_offset = 0.0;
|
||||
|
||||
// Fill in and thicken rectangle if it's an
|
||||
// image layer so that we can see it.
|
||||
match lsn_start.cmp(&lsn_end) {
|
||||
Ordering::Less => num_deltas += 1,
|
||||
Ordering::Equal => {
|
||||
num_images += 1;
|
||||
lsn_diff = 0.3;
|
||||
lsn_offset = -lsn_diff / 2.0;
|
||||
margin = 0.05;
|
||||
fill = Fill::Color(rgb(0, 0, 0));
|
||||
}
|
||||
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
|
||||
}
|
||||
|
||||
println!(
|
||||
" {}",
|
||||
rectangle(
|
||||
key_start as f32 + stretch * margin,
|
||||
stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)),
|
||||
key_diff as f32 - stretch * 2.0 * margin,
|
||||
stretch * (lsn_diff - 2.0 * margin)
|
||||
)
|
||||
.fill(fill)
|
||||
.stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
|
||||
.border_radius(0.4)
|
||||
);
|
||||
}
|
||||
println!("{}", EndSvg);
|
||||
|
||||
eprintln!("num_images: {}", num_images);
|
||||
eprintln!("num_deltas: {}", num_deltas);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,17 +1,14 @@
|
||||
//! Main entry point for the Page Server executable.
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use fail::FailScenario;
|
||||
use nix::unistd::Pid;
|
||||
use tracing::*;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use fail::FailScenario;
|
||||
use metrics::set_build_info_metric;
|
||||
|
||||
use pageserver::{
|
||||
config::{defaults::*, PageServerConf},
|
||||
http, page_cache, page_service, profiling, task_mgr,
|
||||
@@ -19,20 +16,22 @@ use pageserver::{
|
||||
task_mgr::{
|
||||
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
|
||||
},
|
||||
tenant_mgr, virtual_file, LOG_FILE_NAME,
|
||||
tenant_mgr, virtual_file,
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
logging,
|
||||
lock_file, logging,
|
||||
postgres_backend::AuthType,
|
||||
project_git_version,
|
||||
shutdown::exit_now,
|
||||
signals::{self, Signal},
|
||||
tcp_listener,
|
||||
};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
const PID_FILE_NAME: &str = "pageserver.pid";
|
||||
|
||||
const FEATURES: &[&str] = &[
|
||||
#[cfg(feature = "testing")]
|
||||
"testing",
|
||||
@@ -65,6 +64,7 @@ fn main() -> anyhow::Result<()> {
|
||||
let workdir = workdir
|
||||
.canonicalize()
|
||||
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
|
||||
|
||||
let cfg_file_path = workdir.join("pageserver.toml");
|
||||
|
||||
// Set CWD to workdir for non-daemon modes
|
||||
@@ -75,8 +75,6 @@ fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
})?;
|
||||
|
||||
let daemonize = arg_matches.get_flag("daemonize");
|
||||
|
||||
let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
|
||||
ControlFlow::Continue(conf) => conf,
|
||||
ControlFlow::Break(()) => {
|
||||
@@ -102,7 +100,7 @@ fn main() -> anyhow::Result<()> {
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
|
||||
start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
|
||||
start_pageserver(conf).context("Failed to start pageserver")?;
|
||||
|
||||
scenario.teardown();
|
||||
Ok(())
|
||||
@@ -197,12 +195,34 @@ fn initialize_config(
|
||||
})
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
|
||||
// Initialize logger
|
||||
let log_file = logging::init(LOG_FILE_NAME, daemonize)?;
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
logging::init(conf.log_format)?;
|
||||
info!("version: {}", version());
|
||||
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
|
||||
lock_file::LockCreationResult::Created {
|
||||
new_lock_contents,
|
||||
file,
|
||||
} => {
|
||||
info!("Created lock file at {lock_file_path:?} with contents {new_lock_contents}");
|
||||
file
|
||||
}
|
||||
lock_file::LockCreationResult::AlreadyLocked {
|
||||
existing_lock_contents,
|
||||
} => anyhow::bail!(
|
||||
"Could not lock pid file; pageserver is already running in {:?} with PID {}",
|
||||
conf.workdir,
|
||||
existing_lock_contents
|
||||
),
|
||||
lock_file::LockCreationResult::CreationFailed(e) => {
|
||||
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
|
||||
}
|
||||
};
|
||||
// ensure that the lock file is held even if the main thread of the process panics
|
||||
// we need to release the lock file only when the current process is gone
|
||||
let _ = Box::leak(Box::new(lock_file));
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
// bind sockets before daemonizing so we report errors early and do not return until we are listening
|
||||
@@ -218,33 +238,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
);
|
||||
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
|
||||
|
||||
// NB: Don't spawn any threads before daemonizing!
|
||||
if daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
// There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
let stdout = log_file
|
||||
.try_clone()
|
||||
.with_context(|| format!("Failed to clone log file '{:?}'", log_file))?;
|
||||
let stderr = log_file;
|
||||
|
||||
let daemonize = Daemonize::new()
|
||||
.pid_file("pageserver.pid")
|
||||
.working_directory(".")
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
|
||||
// XXX: The parent process should exit abruptly right after
|
||||
// it has spawned a child to prevent coverage machinery from
|
||||
// dumping stats into a `profraw` file now owned by the child.
|
||||
// Otherwise, the coverage data will be damaged.
|
||||
match daemonize.exit_action(|| exit_now(0)).start() {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(err) => bail!("{err}. could not daemonize. bailing."),
|
||||
}
|
||||
}
|
||||
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
|
||||
// start profiler (if enabled)
|
||||
@@ -347,14 +340,6 @@ fn cli() -> Command {
|
||||
Command::new("Neon page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.version(version())
|
||||
.arg(
|
||||
|
||||
Arg::new("daemonize")
|
||||
.short('d')
|
||||
.long("daemonize")
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Run in the background"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("init")
|
||||
.long("init")
|
||||
|
||||
@@ -17,6 +17,7 @@ use toml_edit::{Document, Item};
|
||||
use url::Url;
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
logging::LogFormat,
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
@@ -45,6 +46,8 @@ pub mod defaults {
|
||||
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
|
||||
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
|
||||
|
||||
pub const DEFAULT_LOG_FORMAT: &str = "plain";
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -63,6 +66,7 @@ pub mod defaults {
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
#initial_superuser_name = '{DEFAULT_SUPERUSER}'
|
||||
|
||||
#log_format = '{DEFAULT_LOG_FORMAT}'
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -126,6 +130,8 @@ pub struct PageServerConf {
|
||||
|
||||
/// Etcd broker endpoints to connect to.
|
||||
pub broker_endpoints: Vec<Url>,
|
||||
|
||||
pub log_format: LogFormat,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
@@ -192,6 +198,8 @@ struct PageServerConfigBuilder {
|
||||
profiling: BuilderValue<ProfilingConfig>,
|
||||
broker_etcd_prefix: BuilderValue<String>,
|
||||
broker_endpoints: BuilderValue<Vec<Url>>,
|
||||
|
||||
log_format: BuilderValue<LogFormat>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -219,6 +227,7 @@ impl Default for PageServerConfigBuilder {
|
||||
profiling: Set(ProfilingConfig::Disabled),
|
||||
broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
|
||||
broker_endpoints: Set(Vec::new()),
|
||||
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -291,6 +300,10 @@ impl PageServerConfigBuilder {
|
||||
self.profiling = BuilderValue::Set(profiling)
|
||||
}
|
||||
|
||||
pub fn log_format(&mut self, log_format: LogFormat) {
|
||||
self.log_format = BuilderValue::Set(log_format)
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let broker_endpoints = self
|
||||
.broker_endpoints
|
||||
@@ -335,6 +348,7 @@ impl PageServerConfigBuilder {
|
||||
broker_etcd_prefix: self
|
||||
.broker_etcd_prefix
|
||||
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
|
||||
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -459,6 +473,9 @@ impl PageServerConf {
|
||||
})
|
||||
.collect::<anyhow::Result<_>>()?,
|
||||
),
|
||||
"log_format" => builder.log_format(
|
||||
LogFormat::from_config(&parse_toml_string(key, item)?)?
|
||||
),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -571,6 +588,7 @@ impl PageServerConf {
|
||||
default_tenant_conf: TenantConf::dummy_conf(),
|
||||
broker_endpoints: Vec::new(),
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -665,6 +683,8 @@ max_file_descriptors = 333
|
||||
initial_superuser_name = 'zzzz'
|
||||
id = 10
|
||||
|
||||
log_format = 'json'
|
||||
|
||||
"#;
|
||||
|
||||
#[test]
|
||||
@@ -704,6 +724,7 @@ id = 10
|
||||
.parse()
|
||||
.expect("Failed to parse a valid broker endpoint URL")],
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -748,6 +769,7 @@ id = 10
|
||||
.parse()
|
||||
.expect("Failed to parse a valid broker endpoint URL")],
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::Json,
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -618,6 +618,7 @@ components:
|
||||
- last_record_lsn
|
||||
- disk_consistent_lsn
|
||||
- awaits_download
|
||||
- state
|
||||
properties:
|
||||
timeline_id:
|
||||
type: string
|
||||
@@ -660,6 +661,8 @@ components:
|
||||
type: integer
|
||||
awaits_download:
|
||||
type: boolean
|
||||
state:
|
||||
type: string
|
||||
|
||||
# These 'local' and 'remote' fields just duplicate some of the fields
|
||||
# above. They are kept for backwards-compatibility. They can be removed,
|
||||
|
||||
@@ -129,6 +129,7 @@ async fn build_timeline_info(
|
||||
}
|
||||
};
|
||||
let current_physical_size = Some(timeline.get_physical_size());
|
||||
let state = timeline.current_state();
|
||||
|
||||
let info = TimelineInfo {
|
||||
tenant_id: timeline.tenant_id,
|
||||
@@ -158,6 +159,7 @@ async fn build_timeline_info(
|
||||
|
||||
remote_consistent_lsn,
|
||||
awaits_download,
|
||||
state,
|
||||
|
||||
// Duplicate some fields in 'local' and 'remote' fields, for backwards-compatibility
|
||||
// with the control plane.
|
||||
@@ -225,13 +227,10 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timelines = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
|
||||
let timelines = info_span!("timeline_list", tenant = %tenant_id).in_scope(|| {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
Ok(tenant.list_timelines())
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
})?;
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
for timeline in timelines {
|
||||
@@ -294,7 +293,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
|
||||
let timeline_info = async {
|
||||
let timeline = tokio::task::spawn_blocking(move || {
|
||||
tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id)
|
||||
tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id, false)
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
@@ -331,14 +330,13 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
|
||||
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
||||
|
||||
let timeline = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.and_then(|tenant| tenant.get_timeline(timeline_id))
|
||||
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
|
||||
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let result = match timeline
|
||||
.find_lsn_for_timestamp(timestamp_pg)
|
||||
.map_err(ApiError::InternalServerError)?
|
||||
{
|
||||
LsnForTimestamp::Present(lsn) => format!("{}", lsn),
|
||||
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
|
||||
LsnForTimestamp::Future(_lsn) => "future".into(),
|
||||
LsnForTimestamp::Past(_lsn) => "past".into(),
|
||||
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
|
||||
@@ -522,9 +520,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
// if tenant is in progress of downloading it can be absent in global tenant map
|
||||
let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false))
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false);
|
||||
|
||||
let state = get_state(&request);
|
||||
let remote_index = &state.remote_index;
|
||||
@@ -788,16 +784,16 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
// FIXME: currently this will return a 500 error on bad tenant id; it should be 4XX
|
||||
let repo = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
|
||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||
|
||||
let _span_guard =
|
||||
info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered();
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| repo.get_gc_horizon());
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
|
||||
// Use tenant's pitr setting
|
||||
let pitr = repo.get_pitr_interval();
|
||||
let result = repo
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
let result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
@@ -812,10 +808,9 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let timeline = repo
|
||||
.get_timeline(timeline_id)
|
||||
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
timeline.compact().map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -829,10 +824,9 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let timeline = repo
|
||||
.get_timeline(timeline_id)
|
||||
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
timeline
|
||||
.checkpoint(CheckpointConfig::Forced)
|
||||
|
||||
@@ -12,10 +12,10 @@ use tracing::*;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walingest::WalIngest;
|
||||
use crate::walrecord::DecodedWALRecord;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
|
||||
@@ -8,7 +8,6 @@ pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod profiling;
|
||||
pub mod reltag;
|
||||
pub mod repository;
|
||||
pub mod storage_sync;
|
||||
pub mod task_mgr;
|
||||
@@ -44,8 +43,6 @@ pub const DEFAULT_PG_VERSION: u32 = 14;
|
||||
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
||||
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
||||
|
||||
pub const LOG_FILE_NAME: &str = "pageserver.log";
|
||||
|
||||
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
/// Config for the Repository checkpointer
|
||||
@@ -82,7 +79,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
|
||||
// There should be nothing left, but let's be sure
|
||||
task_mgr::shutdown_tasks(None, None, None).await;
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
@@ -10,8 +10,14 @@
|
||||
//
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use bytes::Bytes;
|
||||
use futures::{Stream, StreamExt};
|
||||
use pageserver_api::models::{
|
||||
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
|
||||
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
|
||||
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
|
||||
PagestreamNblocksRequest, PagestreamNblocksResponse,
|
||||
};
|
||||
use std::io;
|
||||
use std::net::TcpListener;
|
||||
use std::str;
|
||||
@@ -35,7 +41,6 @@ use crate::config::{PageServerConf, ProfilingConfig};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
|
||||
use crate::profiling::profpoint_start;
|
||||
use crate::reltag::RelTag;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::Timeline;
|
||||
@@ -45,163 +50,6 @@ use crate::CheckpointConfig;
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
enum PagestreamFeMessage {
|
||||
Exists(PagestreamExistsRequest),
|
||||
Nblocks(PagestreamNblocksRequest),
|
||||
GetPage(PagestreamGetPageRequest),
|
||||
DbSize(PagestreamDbSizeRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
enum PagestreamBeMessage {
|
||||
Exists(PagestreamExistsResponse),
|
||||
Nblocks(PagestreamNblocksResponse),
|
||||
GetPage(PagestreamGetPageResponse),
|
||||
Error(PagestreamErrorResponse),
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamExistsRequest {
|
||||
latest: bool,
|
||||
lsn: Lsn,
|
||||
rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamNblocksRequest {
|
||||
latest: bool,
|
||||
lsn: Lsn,
|
||||
rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamGetPageRequest {
|
||||
latest: bool,
|
||||
lsn: Lsn,
|
||||
rel: RelTag,
|
||||
blkno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamDbSizeRequest {
|
||||
latest: bool,
|
||||
lsn: Lsn,
|
||||
dbnode: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamExistsResponse {
|
||||
exists: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamNblocksResponse {
|
||||
n_blocks: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamGetPageResponse {
|
||||
page: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamErrorResponse {
|
||||
message: String,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamDbSizeResponse {
|
||||
db_size: i64,
|
||||
}
|
||||
|
||||
impl PagestreamFeMessage {
|
||||
fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
|
||||
// TODO these gets can fail
|
||||
|
||||
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
||||
//
|
||||
// TODO: consider using protobuf or serde bincode for less error prone
|
||||
// serialization.
|
||||
let msg_tag = body.get_u8();
|
||||
match msg_tag {
|
||||
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
rel: RelTag {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
},
|
||||
})),
|
||||
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
rel: RelTag {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
},
|
||||
})),
|
||||
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
rel: RelTag {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
},
|
||||
blkno: body.get_u32(),
|
||||
})),
|
||||
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
dbnode: body.get_u32(),
|
||||
})),
|
||||
_ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PagestreamBeMessage {
|
||||
fn serialize(&self) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
match self {
|
||||
Self::Exists(resp) => {
|
||||
bytes.put_u8(100); /* tag from pagestore_client.h */
|
||||
bytes.put_u8(resp.exists as u8);
|
||||
}
|
||||
|
||||
Self::Nblocks(resp) => {
|
||||
bytes.put_u8(101); /* tag from pagestore_client.h */
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
|
||||
Self::GetPage(resp) => {
|
||||
bytes.put_u8(102); /* tag from pagestore_client.h */
|
||||
bytes.put(&resp.page[..]);
|
||||
}
|
||||
|
||||
Self::Error(resp) => {
|
||||
bytes.put_u8(103); /* tag from pagestore_client.h */
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
Self::DbSize(resp) => {
|
||||
bytes.put_u8(104); /* tag from pagestore_client.h */
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
}
|
||||
}
|
||||
|
||||
fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Bytes>> + '_ {
|
||||
async_stream::try_stream! {
|
||||
loop {
|
||||
@@ -1060,7 +908,8 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
}
|
||||
|
||||
fn get_local_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> Result<Arc<Timeline>> {
|
||||
tenant_mgr::get_tenant(tenant_id, true).and_then(|tenant| tenant.get_timeline(timeline_id))
|
||||
tenant_mgr::get_tenant(tenant_id, true)
|
||||
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
@@ -7,12 +7,12 @@
|
||||
//! Clarify that)
|
||||
//!
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::repository::*;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::{Oid, TimestampTz, TransactionId};
|
||||
|
||||
@@ -11,7 +11,8 @@
|
||||
//! parent timeline, and the last LSN that has been written to disk.
|
||||
//!
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use anyhow::{bail, Context};
|
||||
use pageserver_api::models::TimelineState;
|
||||
use tokio::sync::watch;
|
||||
use tracing::*;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
@@ -25,7 +26,6 @@ use std::fs::File;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::num::NonZeroU64;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
@@ -190,6 +190,7 @@ impl UninitializedTimeline<'_> {
|
||||
"Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
|
||||
)
|
||||
})?;
|
||||
new_timeline.set_state(TimelineState::Active);
|
||||
v.insert(Arc::clone(&new_timeline));
|
||||
new_timeline.launch_wal_receiver();
|
||||
}
|
||||
@@ -292,7 +293,7 @@ impl TimelineUninitMark {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete_mark_file_if_present(&mut self) -> Result<(), anyhow::Error> {
|
||||
fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
|
||||
let uninit_mark_file = &self.uninit_mark_path;
|
||||
let uninit_mark_parent = uninit_mark_file
|
||||
.parent()
|
||||
@@ -339,18 +340,26 @@ impl Tenant {
|
||||
|
||||
/// Get Timeline handle for given Neon timeline ID.
|
||||
/// This function is idempotent. It doesn't change internal state in any way.
|
||||
pub fn get_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<Arc<Timeline>> {
|
||||
self.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.get(&timeline_id)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Timeline {} was not found for tenant {}",
|
||||
timeline_id, self.tenant_id
|
||||
)
|
||||
})
|
||||
.map(Arc::clone)
|
||||
pub fn get_timeline(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
active_only: bool,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let timelines_accessor = self.timelines.lock().unwrap();
|
||||
let timeline = timelines_accessor.get(&timeline_id).with_context(|| {
|
||||
format!("Timeline {}/{} was not found", self.tenant_id, timeline_id)
|
||||
})?;
|
||||
|
||||
if active_only && !timeline.is_active() {
|
||||
anyhow::bail!(
|
||||
"Timeline {}/{} is not active, state: {:?}",
|
||||
self.tenant_id,
|
||||
timeline_id,
|
||||
timeline.current_state()
|
||||
)
|
||||
} else {
|
||||
Ok(Arc::clone(timeline))
|
||||
}
|
||||
}
|
||||
|
||||
/// Lists timelines the tenant contains.
|
||||
@@ -373,6 +382,11 @@ impl Tenant {
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<UninitializedTimeline> {
|
||||
anyhow::ensure!(
|
||||
self.is_active(),
|
||||
"Cannot create empty timelines on inactive tenant"
|
||||
);
|
||||
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id, &timelines)?;
|
||||
drop(timelines);
|
||||
@@ -409,9 +423,14 @@ impl Tenant {
|
||||
mut ancestor_start_lsn: Option<Lsn>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<Arc<Timeline>>> {
|
||||
anyhow::ensure!(
|
||||
self.is_active(),
|
||||
"Cannot create timelines on inactive tenant"
|
||||
);
|
||||
|
||||
let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate);
|
||||
|
||||
if self.get_timeline(new_timeline_id).is_ok() {
|
||||
if self.get_timeline(new_timeline_id, false).is_ok() {
|
||||
debug!("timeline {new_timeline_id} already exists");
|
||||
return Ok(None);
|
||||
}
|
||||
@@ -419,7 +438,7 @@ impl Tenant {
|
||||
let loaded_timeline = match ancestor_timeline_id {
|
||||
Some(ancestor_timeline_id) => {
|
||||
let ancestor_timeline = self
|
||||
.get_timeline(ancestor_timeline_id)
|
||||
.get_timeline(ancestor_timeline_id, false)
|
||||
.context("Cannot branch off the timeline that's not present in pageserver")?;
|
||||
|
||||
if let Some(lsn) = ancestor_start_lsn.as_mut() {
|
||||
@@ -470,7 +489,12 @@ impl Tenant {
|
||||
horizon: u64,
|
||||
pitr: Duration,
|
||||
checkpoint_before_gc: bool,
|
||||
) -> Result<GcResult> {
|
||||
) -> anyhow::Result<GcResult> {
|
||||
anyhow::ensure!(
|
||||
self.is_active(),
|
||||
"Cannot run GC iteration on inactive tenant"
|
||||
);
|
||||
|
||||
let timeline_str = target_timeline_id
|
||||
.map(|x| x.to_string())
|
||||
.unwrap_or_else(|| "-".to_string());
|
||||
@@ -486,7 +510,12 @@ impl Tenant {
|
||||
/// This function is periodically called by compactor task.
|
||||
/// Also it can be explicitly requested per timeline through page server
|
||||
/// api's 'compact' command.
|
||||
pub fn compaction_iteration(&self) -> Result<()> {
|
||||
pub fn compaction_iteration(&self) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(
|
||||
self.is_active(),
|
||||
"Cannot run compaction iteration on inactive tenant"
|
||||
);
|
||||
|
||||
// Scan through the hashmap and collect a list of all the timelines,
|
||||
// while holding the lock. Then drop the lock and actually perform the
|
||||
// compactions. We don't want to block everything else while the
|
||||
@@ -510,19 +539,19 @@ impl Tenant {
|
||||
///
|
||||
/// Used at graceful shutdown.
|
||||
///
|
||||
pub fn checkpoint(&self) -> Result<()> {
|
||||
pub fn checkpoint(&self) -> anyhow::Result<()> {
|
||||
// Scan through the hashmap and collect a list of all the timelines,
|
||||
// while holding the lock. Then drop the lock and actually perform the
|
||||
// checkpoints. We don't want to block everything else while the
|
||||
// checkpoint runs.
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timelines_to_compact = timelines
|
||||
let timelines_to_checkpoint = timelines
|
||||
.iter()
|
||||
.map(|(timeline_id, timeline)| (*timeline_id, Arc::clone(timeline)))
|
||||
.collect::<Vec<_>>();
|
||||
drop(timelines);
|
||||
|
||||
for (timeline_id, timeline) in &timelines_to_compact {
|
||||
for (timeline_id, timeline) in &timelines_to_checkpoint {
|
||||
let _entered =
|
||||
info_span!("checkpoint", timeline = %timeline_id, tenant = %self.tenant_id)
|
||||
.entered();
|
||||
@@ -544,7 +573,7 @@ impl Tenant {
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
|
||||
ensure!(
|
||||
anyhow::ensure!(
|
||||
!children_exist,
|
||||
"Cannot delete timeline which has child timelines"
|
||||
);
|
||||
@@ -553,7 +582,10 @@ impl Tenant {
|
||||
Entry::Vacant(_) => bail!("timeline not found"),
|
||||
};
|
||||
|
||||
let layer_removal_guard = timeline_entry.get().layer_removal_guard()?;
|
||||
let timeline = timeline_entry.get();
|
||||
timeline.set_state(TimelineState::Paused);
|
||||
|
||||
let layer_removal_guard = timeline.layer_removal_guard()?;
|
||||
|
||||
let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
|
||||
std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
|
||||
@@ -570,58 +602,6 @@ impl Tenant {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn init_attach_timelines(
|
||||
&self,
|
||||
timelines: HashMap<TimelineId, TimelineMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
let sorted_timelines = if timelines.len() == 1 {
|
||||
timelines.into_iter().collect()
|
||||
} else if !timelines.is_empty() {
|
||||
tree_sort_timelines(timelines)?
|
||||
} else {
|
||||
warn!("No timelines to attach received");
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let mut timelines_accessor = self.timelines.lock().unwrap();
|
||||
for (timeline_id, metadata) in sorted_timelines {
|
||||
info!(
|
||||
"Attaching timeline {} pg_version {}",
|
||||
timeline_id,
|
||||
metadata.pg_version()
|
||||
);
|
||||
|
||||
if timelines_accessor.contains_key(&timeline_id) {
|
||||
warn!(
|
||||
"Timeline {}/{} already exists in the tenant map, skipping its initialization",
|
||||
self.tenant_id, timeline_id
|
||||
);
|
||||
continue;
|
||||
} else {
|
||||
let ancestor = metadata
|
||||
.ancestor_timeline()
|
||||
.and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id))
|
||||
.cloned();
|
||||
let timeline = UninitializedTimeline {
|
||||
owning_tenant: self,
|
||||
timeline_id,
|
||||
raw_timeline: Some((
|
||||
self.create_timeline_data(timeline_id, metadata, ancestor)
|
||||
.with_context(|| {
|
||||
format!("Failed to initialize timeline {timeline_id}")
|
||||
})?,
|
||||
TimelineUninitMark::dummy(),
|
||||
)),
|
||||
};
|
||||
let initialized_timeline =
|
||||
timeline.initialize_with_lock(&mut timelines_accessor, true)?;
|
||||
timelines_accessor.insert(timeline_id, initialized_timeline);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Allows to retrieve remote timeline index from the tenant. Used in walreceiver to grab remote consistent lsn.
|
||||
pub fn get_remote_index(&self) -> &RemoteIndex {
|
||||
&self.remote_index
|
||||
@@ -662,10 +642,30 @@ impl Tenant {
|
||||
}
|
||||
(_, new_state) => {
|
||||
self.state.send_replace(new_state);
|
||||
if self.should_run_tasks() {
|
||||
// Spawn gc and compaction loops. The loops will shut themselves
|
||||
// down when they notice that the tenant is inactive.
|
||||
crate::tenant_tasks::start_background_loops(self.tenant_id);
|
||||
|
||||
let timelines_accessor = self.timelines.lock().unwrap();
|
||||
let not_broken_timelines = timelines_accessor
|
||||
.values()
|
||||
.filter(|timeline| timeline.current_state() != TimelineState::Broken);
|
||||
match new_state {
|
||||
TenantState::Active {
|
||||
background_jobs_running,
|
||||
} => {
|
||||
if background_jobs_running {
|
||||
// Spawn gc and compaction loops. The loops will shut themselves
|
||||
// down when they notice that the tenant is inactive.
|
||||
crate::tenant_tasks::start_background_loops(self.tenant_id);
|
||||
}
|
||||
|
||||
for timeline in not_broken_timelines {
|
||||
timeline.set_state(TimelineState::Active);
|
||||
}
|
||||
}
|
||||
TenantState::Paused | TenantState::Broken => {
|
||||
for timeline in not_broken_timelines {
|
||||
timeline.set_state(TimelineState::Suspended);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -681,7 +681,7 @@ impl Tenant {
|
||||
/// before the children.
|
||||
fn tree_sort_timelines(
|
||||
timelines: HashMap<TimelineId, TimelineMetadata>,
|
||||
) -> Result<Vec<(TimelineId, TimelineMetadata)>> {
|
||||
) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> {
|
||||
let mut result = Vec::with_capacity(timelines.len());
|
||||
|
||||
let mut now = Vec::with_capacity(timelines.len());
|
||||
@@ -784,27 +784,6 @@ impl Tenant {
|
||||
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
|
||||
}
|
||||
|
||||
pub fn get_wal_receiver_connect_timeout(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap();
|
||||
tenant_conf
|
||||
.walreceiver_connect_timeout
|
||||
.unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout)
|
||||
}
|
||||
|
||||
pub fn get_lagging_wal_timeout(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap();
|
||||
tenant_conf
|
||||
.lagging_wal_timeout
|
||||
.unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout)
|
||||
}
|
||||
|
||||
pub fn get_max_lsn_wal_lag(&self) -> NonZeroU64 {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap();
|
||||
tenant_conf
|
||||
.max_lsn_wal_lag
|
||||
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag)
|
||||
}
|
||||
|
||||
pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
|
||||
self.tenant_conf.write().unwrap().update(&new_tenant_conf);
|
||||
}
|
||||
@@ -836,7 +815,7 @@ impl Tenant {
|
||||
))
|
||||
}
|
||||
|
||||
pub fn new(
|
||||
pub(super) fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
@@ -859,7 +838,7 @@ impl Tenant {
|
||||
}
|
||||
|
||||
/// Locate and load config
|
||||
pub fn load_tenant_config(
|
||||
pub(super) fn load_tenant_config(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<TenantConfOpt> {
|
||||
@@ -901,7 +880,7 @@ impl Tenant {
|
||||
Ok(tenant_conf)
|
||||
}
|
||||
|
||||
pub fn persist_tenant_config(
|
||||
pub(super) fn persist_tenant_config(
|
||||
target_config_path: &Path,
|
||||
tenant_conf: TenantConfOpt,
|
||||
first_save: bool,
|
||||
@@ -994,7 +973,7 @@ impl Tenant {
|
||||
horizon: u64,
|
||||
pitr: Duration,
|
||||
checkpoint_before_gc: bool,
|
||||
) -> Result<GcResult> {
|
||||
) -> anyhow::Result<GcResult> {
|
||||
let mut totals: GcResult = Default::default();
|
||||
let now = Instant::now();
|
||||
|
||||
@@ -1048,7 +1027,7 @@ impl Tenant {
|
||||
for timeline_id in timeline_ids {
|
||||
// Timeline is known to be local and loaded.
|
||||
let timeline = self
|
||||
.get_timeline(timeline_id)
|
||||
.get_timeline(timeline_id, false)
|
||||
.with_context(|| format!("Timeline {timeline_id} was not found"))?;
|
||||
|
||||
// If target_timeline is specified, ignore all other timelines
|
||||
@@ -1133,7 +1112,7 @@ impl Tenant {
|
||||
// Step 2 is to avoid initializing the new branch using data removed by past GC iterations
|
||||
// or in-queue GC iterations.
|
||||
|
||||
let src_timeline = self.get_timeline(src).with_context(|| {
|
||||
let src_timeline = self.get_timeline(src, false).with_context(|| {
|
||||
format!(
|
||||
"No ancestor {} found for timeline {}/{}",
|
||||
src, self.tenant_id, dst
|
||||
@@ -1403,6 +1382,68 @@ impl Tenant {
|
||||
|
||||
Ok(uninit_mark)
|
||||
}
|
||||
|
||||
pub(super) fn init_attach_timelines(
|
||||
&self,
|
||||
timelines: HashMap<TimelineId, TimelineMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
let sorted_timelines = if timelines.len() == 1 {
|
||||
timelines.into_iter().collect()
|
||||
} else if !timelines.is_empty() {
|
||||
tree_sort_timelines(timelines)?
|
||||
} else {
|
||||
warn!("No timelines to attach received");
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let tenant_id = self.tenant_id;
|
||||
let mut timelines_accessor = self.timelines.lock().unwrap();
|
||||
for (timeline_id, metadata) in sorted_timelines {
|
||||
info!(
|
||||
"Attaching timeline {}/{} pg_version {}",
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
metadata.pg_version()
|
||||
);
|
||||
|
||||
if timelines_accessor.contains_key(&timeline_id) {
|
||||
warn!("Timeline {tenant_id}/{timeline_id} already exists in the tenant map, skipping its initialization");
|
||||
continue;
|
||||
}
|
||||
|
||||
let ancestor = metadata
|
||||
.ancestor_timeline()
|
||||
.and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id))
|
||||
.cloned();
|
||||
let dummy_timeline = self
|
||||
.create_timeline_data(timeline_id, metadata.clone(), ancestor.clone())
|
||||
.with_context(|| {
|
||||
format!("Failed to crate dummy timeline data for {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
let timeline = UninitializedTimeline {
|
||||
owning_tenant: self,
|
||||
timeline_id,
|
||||
raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())),
|
||||
};
|
||||
match timeline.initialize_with_lock(&mut timelines_accessor, true) {
|
||||
Ok(initialized_timeline) => {
|
||||
timelines_accessor.insert(timeline_id, initialized_timeline);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to initialize timeline {tenant_id}/{timeline_id}: {e:?}");
|
||||
let broken_timeline = self
|
||||
.create_timeline_data(timeline_id, metadata, ancestor)
|
||||
.with_context(|| {
|
||||
format!("Failed to crate broken timeline data for {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
broken_timeline.set_state(TimelineState::Broken);
|
||||
timelines_accessor.insert(timeline_id, Arc::new(broken_timeline));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
|
||||
@@ -1411,7 +1452,7 @@ fn run_initdb(
|
||||
conf: &'static PageServerConf,
|
||||
initdb_target_dir: &Path,
|
||||
pg_version: u32,
|
||||
) -> Result<()> {
|
||||
) -> anyhow::Result<()> {
|
||||
let initdb_bin_path = conf.pg_bin_dir(pg_version)?.join("initdb");
|
||||
let initdb_lib_dir = conf.pg_lib_dir(pg_version)?;
|
||||
info!(
|
||||
@@ -1457,7 +1498,7 @@ impl Drop for Tenant {
|
||||
}
|
||||
}
|
||||
/// Dump contents of a layer file to stdout.
|
||||
pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> {
|
||||
pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> anyhow::Result<()> {
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
// All layer files start with a two-byte "magic" value, to identify the kind of
|
||||
@@ -1562,13 +1603,13 @@ pub mod harness {
|
||||
}
|
||||
|
||||
impl<'a> TenantHarness<'a> {
|
||||
pub fn create(test_name: &'static str) -> Result<Self> {
|
||||
pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
|
||||
Self::create_internal(test_name, false)
|
||||
}
|
||||
pub fn create_exclusive(test_name: &'static str) -> Result<Self> {
|
||||
pub fn create_exclusive(test_name: &'static str) -> anyhow::Result<Self> {
|
||||
Self::create_internal(test_name, true)
|
||||
}
|
||||
fn create_internal(test_name: &'static str, exclusive: bool) -> Result<Self> {
|
||||
fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result<Self> {
|
||||
let lock_guard = if exclusive {
|
||||
(None, Some(LOCK.write().unwrap()))
|
||||
} else {
|
||||
@@ -1602,7 +1643,7 @@ pub mod harness {
|
||||
self.try_load().expect("failed to load test tenant")
|
||||
}
|
||||
|
||||
pub fn try_load(&self) -> Result<Tenant> {
|
||||
pub fn try_load(&self) -> anyhow::Result<Tenant> {
|
||||
let walredo_mgr = Arc::new(TestRedoManager);
|
||||
|
||||
let tenant = Tenant::new(
|
||||
@@ -1630,6 +1671,9 @@ pub mod harness {
|
||||
timelines_to_load.insert(timeline_id, timeline_metadata);
|
||||
}
|
||||
tenant.init_attach_timelines(timelines_to_load)?;
|
||||
tenant.set_state(TenantState::Active {
|
||||
background_jobs_running: false,
|
||||
});
|
||||
|
||||
Ok(tenant)
|
||||
}
|
||||
@@ -1682,7 +1726,7 @@ pub mod harness {
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{}", s);
|
||||
println!("{s}");
|
||||
|
||||
Ok(TEST_IMG(&s))
|
||||
}
|
||||
@@ -1706,7 +1750,7 @@ mod tests {
|
||||
Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
|
||||
|
||||
#[test]
|
||||
fn test_basic() -> Result<()> {
|
||||
fn test_basic() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_basic")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -1730,7 +1774,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn no_duplicate_timelines() -> Result<()> {
|
||||
fn no_duplicate_timelines() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("no_duplicate_timelines")?.load();
|
||||
let _ = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -1761,7 +1805,7 @@ mod tests {
|
||||
/// Test branch creation
|
||||
///
|
||||
#[test]
|
||||
fn test_branch() -> Result<()> {
|
||||
fn test_branch() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_branch")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -1789,7 +1833,7 @@ mod tests {
|
||||
// Branch the history, modify relation differently on the new timeline
|
||||
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?;
|
||||
let newtline = tenant
|
||||
.get_timeline(NEW_TIMELINE_ID)
|
||||
.get_timeline(NEW_TIMELINE_ID, true)
|
||||
.expect("Should have a local timeline");
|
||||
let new_writer = newtline.writer();
|
||||
new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?;
|
||||
@@ -1814,7 +1858,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> Result<()> {
|
||||
fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
|
||||
let mut lsn = start_lsn;
|
||||
#[allow(non_snake_case)]
|
||||
{
|
||||
@@ -1856,7 +1900,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> {
|
||||
fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let tenant =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
|
||||
.load();
|
||||
@@ -1888,7 +1932,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> {
|
||||
fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> {
|
||||
let tenant =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load();
|
||||
|
||||
@@ -1915,7 +1959,7 @@ mod tests {
|
||||
// FIXME: This currently fails to error out. Calling GC doesn't currently
|
||||
// remove the old value, we'd need to work a little harder
|
||||
#[test]
|
||||
fn test_prohibit_get_for_garbage_collected_data() -> Result<()> {
|
||||
fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
|
||||
.load();
|
||||
@@ -1935,7 +1979,7 @@ mod tests {
|
||||
*/
|
||||
|
||||
#[test]
|
||||
fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> {
|
||||
fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
|
||||
let tenant =
|
||||
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
|
||||
let tline = tenant
|
||||
@@ -1945,7 +1989,7 @@ mod tests {
|
||||
|
||||
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
let newtline = tenant
|
||||
.get_timeline(NEW_TIMELINE_ID)
|
||||
.get_timeline(NEW_TIMELINE_ID, true)
|
||||
.expect("Should have a local timeline");
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
@@ -1954,7 +1998,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
#[test]
|
||||
fn test_parent_keeps_data_forever_after_branching() -> Result<()> {
|
||||
fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
|
||||
let tenant =
|
||||
TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load();
|
||||
let tline = tenant
|
||||
@@ -1964,7 +2008,7 @@ mod tests {
|
||||
|
||||
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
let newtline = tenant
|
||||
.get_timeline(NEW_TIMELINE_ID)
|
||||
.get_timeline(NEW_TIMELINE_ID, true)
|
||||
.expect("Should have a local timeline");
|
||||
|
||||
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
|
||||
@@ -1982,7 +2026,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn timeline_load() -> Result<()> {
|
||||
fn timeline_load() -> anyhow::Result<()> {
|
||||
const TEST_NAME: &str = "timeline_load";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
{
|
||||
@@ -1996,14 +2040,14 @@ mod tests {
|
||||
|
||||
let tenant = harness.load();
|
||||
tenant
|
||||
.get_timeline(TIMELINE_ID)
|
||||
.get_timeline(TIMELINE_ID, true)
|
||||
.expect("cannot load timeline");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn timeline_load_with_ancestor() -> Result<()> {
|
||||
fn timeline_load_with_ancestor() -> anyhow::Result<()> {
|
||||
const TEST_NAME: &str = "timeline_load_with_ancestor";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
// create two timelines
|
||||
@@ -2019,7 +2063,7 @@ mod tests {
|
||||
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
|
||||
let newtline = tenant
|
||||
.get_timeline(NEW_TIMELINE_ID)
|
||||
.get_timeline(NEW_TIMELINE_ID, true)
|
||||
.expect("Should have a local timeline");
|
||||
|
||||
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
|
||||
@@ -2031,18 +2075,18 @@ mod tests {
|
||||
|
||||
// check that both, child and ancestor are loaded
|
||||
let _child_tline = tenant
|
||||
.get_timeline(NEW_TIMELINE_ID)
|
||||
.get_timeline(NEW_TIMELINE_ID, true)
|
||||
.expect("cannot get child timeline loaded");
|
||||
|
||||
let _ancestor_tline = tenant
|
||||
.get_timeline(TIMELINE_ID)
|
||||
.get_timeline(TIMELINE_ID, true)
|
||||
.expect("cannot get ancestor timeline loaded");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn corrupt_metadata() -> Result<()> {
|
||||
fn corrupt_metadata() -> anyhow::Result<()> {
|
||||
const TEST_NAME: &str = "corrupt_metadata";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
let tenant = harness.load();
|
||||
@@ -2084,7 +2128,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_images() -> Result<()> {
|
||||
fn test_images() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_images")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -2136,7 +2180,7 @@ mod tests {
|
||||
// repeat 50 times.
|
||||
//
|
||||
#[test]
|
||||
fn test_bulk_insert() -> Result<()> {
|
||||
fn test_bulk_insert() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_bulk_insert")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -2178,7 +2222,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_random_updates() -> Result<()> {
|
||||
fn test_random_updates() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_random_updates")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -2250,7 +2294,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_traverse_branches() -> Result<()> {
|
||||
fn test_traverse_branches() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_traverse_branches")?.load();
|
||||
let mut tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -2289,7 +2333,7 @@ mod tests {
|
||||
let new_tline_id = TimelineId::generate();
|
||||
tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?;
|
||||
tline = tenant
|
||||
.get_timeline(new_tline_id)
|
||||
.get_timeline(new_tline_id, true)
|
||||
.expect("Should have the branched timeline");
|
||||
tline_id = new_tline_id;
|
||||
|
||||
@@ -2331,7 +2375,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_traverse_ancestors() -> Result<()> {
|
||||
fn test_traverse_ancestors() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_traverse_ancestors")?.load();
|
||||
let mut tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -2352,7 +2396,7 @@ mod tests {
|
||||
let new_tline_id = TimelineId::generate();
|
||||
tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?;
|
||||
tline = tenant
|
||||
.get_timeline(new_tline_id)
|
||||
.get_timeline(new_tline_id, true)
|
||||
.expect("Should have the branched timeline");
|
||||
tline_id = new_tline_id;
|
||||
|
||||
|
||||
@@ -610,9 +610,9 @@ impl DeltaLayer {
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
pub struct DeltaLayerWriter {
|
||||
struct DeltaLayerWriterInner {
|
||||
conf: &'static PageServerConf,
|
||||
path: PathBuf,
|
||||
pub path: PathBuf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
|
||||
@@ -624,17 +624,17 @@ pub struct DeltaLayerWriter {
|
||||
blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
|
||||
}
|
||||
|
||||
impl DeltaLayerWriter {
|
||||
impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
pub fn new(
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
) -> Result<DeltaLayerWriter> {
|
||||
) -> anyhow::Result<Self> {
|
||||
// Create the file initially with a temporary filename. We don't know
|
||||
// the end key yet, so we cannot form the final filename yet. We will
|
||||
// rename it when we're done.
|
||||
@@ -653,7 +653,7 @@ impl DeltaLayerWriter {
|
||||
let block_buf = BlockBuf::new();
|
||||
let tree_builder = DiskBtreeBuilder::new(block_buf);
|
||||
|
||||
Ok(DeltaLayerWriter {
|
||||
Ok(Self {
|
||||
conf,
|
||||
path,
|
||||
timeline_id,
|
||||
@@ -670,17 +670,17 @@ impl DeltaLayerWriter {
|
||||
///
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
|
||||
fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
|
||||
}
|
||||
|
||||
pub fn put_value_bytes(
|
||||
fn put_value_bytes(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: &[u8],
|
||||
will_init: bool,
|
||||
) -> Result<()> {
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(self.lsn_range.start <= lsn);
|
||||
|
||||
let off = self.blob_writer.write_blob(val)?;
|
||||
@@ -693,14 +693,14 @@ impl DeltaLayerWriter {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
fn size(&self) -> u64 {
|
||||
self.blob_writer.size() + self.tree.borrow_writer().size()
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
pub fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -768,6 +768,102 @@ impl DeltaLayerWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new delta layer.
|
||||
///
|
||||
/// Usage:
|
||||
///
|
||||
/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
|
||||
///
|
||||
/// 2. Write the contents by calling `put_value` for every page
|
||||
/// version to store in the layer.
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
|
||||
/// possible for the writer to drop before `finish` is actually called. So this
|
||||
/// could lead to odd temporary files in the directory, exhausting file system.
|
||||
/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`
|
||||
/// implementation that cleans up the temporary file in failure. It's not
|
||||
/// possible to do this directly in `DeltaLayerWriterInner` since `finish` moves
|
||||
/// out some fields, making it impossible to implement `Drop`.
|
||||
///
|
||||
#[must_use]
|
||||
pub struct DeltaLayerWriter {
|
||||
inner: Option<DeltaLayerWriterInner>,
|
||||
}
|
||||
|
||||
impl DeltaLayerWriter {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
inner: Some(DeltaLayerWriterInner::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_start,
|
||||
lsn_range,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Append a key-value pair to the file.
|
||||
///
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_value(key, lsn, val)
|
||||
}
|
||||
|
||||
pub fn put_value_bytes(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: &[u8],
|
||||
will_init: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
self.inner
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.put_value_bytes(key, lsn, val, will_init)
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
self.inner.as_ref().unwrap().size()
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
self.inner.take().unwrap().finish(key_end)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for DeltaLayerWriter {
|
||||
fn drop(&mut self) {
|
||||
if let Some(inner) = self.inner.take() {
|
||||
match inner.blob_writer.into_inner().into_inner() {
|
||||
Ok(vfile) => vfile.remove(),
|
||||
Err(err) => warn!(
|
||||
"error while flushing buffer of image layer temporary file: {}",
|
||||
err
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator over all key-value pairs stored in a delta layer
|
||||
///
|
||||
|
||||
@@ -411,7 +411,7 @@ impl ImageLayer {
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
pub struct ImageLayerWriter {
|
||||
struct ImageLayerWriterInner {
|
||||
conf: &'static PageServerConf,
|
||||
path: PathBuf,
|
||||
timeline_id: TimelineId,
|
||||
@@ -423,14 +423,17 @@ pub struct ImageLayerWriter {
|
||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||
}
|
||||
|
||||
impl ImageLayerWriter {
|
||||
pub fn new(
|
||||
impl ImageLayerWriterInner {
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
) -> anyhow::Result<Self> {
|
||||
// Create the file initially with a temporary filename.
|
||||
// We'll atomically rename it to the final name when we're done.
|
||||
let path = ImageLayer::temp_path_for(
|
||||
@@ -455,7 +458,7 @@ impl ImageLayerWriter {
|
||||
let block_buf = BlockBuf::new();
|
||||
let tree_builder = DiskBtreeBuilder::new(block_buf);
|
||||
|
||||
let writer = ImageLayerWriter {
|
||||
let writer = Self {
|
||||
conf,
|
||||
path,
|
||||
timeline_id,
|
||||
@@ -474,7 +477,7 @@ impl ImageLayerWriter {
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
|
||||
fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
ensure!(self.key_range.contains(&key));
|
||||
let off = self.blob_writer.write_blob(img)?;
|
||||
|
||||
@@ -485,7 +488,10 @@ impl ImageLayerWriter {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(self) -> anyhow::Result<ImageLayer> {
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
fn finish(self) -> anyhow::Result<ImageLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -552,3 +558,76 @@ impl ImageLayerWriter {
|
||||
Ok(layer)
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new image layer.
|
||||
///
|
||||
/// Usage:
|
||||
///
|
||||
/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
|
||||
///
|
||||
/// 2. Write the contents by calling `put_image` for every key-value
|
||||
/// pair in the key range.
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
|
||||
/// possible for the writer to drop before `finish` is actually called. So this
|
||||
/// could lead to odd temporary files in the directory, exhausting file system.
|
||||
/// This structure wraps `ImageLayerWriterInner` and also contains `Drop`
|
||||
/// implementation that cleans up the temporary file in failure. It's not
|
||||
/// possible to do this directly in `ImageLayerWriterInner` since `finish` moves
|
||||
/// out some fields, making it impossible to implement `Drop`.
|
||||
///
|
||||
#[must_use]
|
||||
pub struct ImageLayerWriter {
|
||||
inner: Option<ImageLayerWriterInner>,
|
||||
}
|
||||
|
||||
impl ImageLayerWriter {
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
Ok(Self {
|
||||
inner: Some(ImageLayerWriterInner::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_range,
|
||||
lsn,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Write next value to the file.
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_image(key, img)
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
|
||||
self.inner.take().unwrap().finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ImageLayerWriter {
|
||||
fn drop(&mut self) {
|
||||
if let Some(inner) = self.inner.take() {
|
||||
inner.blob_writer.into_inner().remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
//!
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use bytes::Bytes;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use tokio::sync::watch;
|
||||
use tokio::task::spawn_blocking;
|
||||
use tracing::*;
|
||||
|
||||
@@ -35,8 +37,8 @@ use crate::metrics::TimelineMetrics;
|
||||
use crate::pgdatadir_mapping::BlockNumber;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
|
||||
use crate::reltag::RelTag;
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use pageserver_api::reltag::RelTag;
|
||||
|
||||
use postgres_ffi::to_pg_timestamp;
|
||||
use utils::{
|
||||
@@ -160,6 +162,8 @@ pub struct Timeline {
|
||||
|
||||
/// Relation size cache
|
||||
pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
|
||||
|
||||
state: watch::Sender<TimelineState>,
|
||||
}
|
||||
|
||||
/// Internal structure to hold all data needed for logical size calculation.
|
||||
@@ -307,10 +311,6 @@ pub struct GcInfo {
|
||||
|
||||
/// Public interface functions
|
||||
impl Timeline {
|
||||
//------------------------------------------------------------------------------
|
||||
// Public GET functions
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Get the LSN where this branch was created
|
||||
pub fn get_ancestor_lsn(&self) -> Lsn {
|
||||
self.ancestor_lsn
|
||||
@@ -420,9 +420,11 @@ impl Timeline {
|
||||
/// those functions with an LSN that has not been processed yet is an error.
|
||||
///
|
||||
pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline");
|
||||
|
||||
// This should never be called from the WAL receiver, because that could lead
|
||||
// to a deadlock.
|
||||
ensure!(
|
||||
anyhow::ensure!(
|
||||
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection),
|
||||
"wait_lsn cannot be called in WAL receiver"
|
||||
);
|
||||
@@ -445,7 +447,7 @@ impl Timeline {
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
|
||||
) -> Result<()> {
|
||||
) -> anyhow::Result<()> {
|
||||
ensure!(
|
||||
lsn >= **latest_gc_cutoff_lsn,
|
||||
"LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
|
||||
@@ -455,12 +457,6 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Public PUT functions, to update the repository with new page versions.
|
||||
//
|
||||
// These are called by the WAL receiver to digest WAL records.
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
|
||||
@@ -479,6 +475,91 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn compact(&self) -> anyhow::Result<()> {
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
|
||||
// Last record Lsn could be zero in case the timeline was just created
|
||||
if !last_record_lsn.is_valid() {
|
||||
warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
//
|
||||
// High level strategy for compaction / image creation:
|
||||
//
|
||||
// 1. First, calculate the desired "partitioning" of the
|
||||
// currently in-use key space. The goal is to partition the
|
||||
// key space into roughly fixed-size chunks, but also take into
|
||||
// account any existing image layers, and try to align the
|
||||
// chunk boundaries with the existing image layers to avoid
|
||||
// too much churn. Also try to align chunk boundaries with
|
||||
// relation boundaries. In principle, we don't know about
|
||||
// relation boundaries here, we just deal with key-value
|
||||
// pairs, and the code in pgdatadir_mapping.rs knows how to
|
||||
// map relations into key-value pairs. But in practice we know
|
||||
// that 'field6' is the block number, and the fields 1-5
|
||||
// identify a relation. This is just an optimization,
|
||||
// though.
|
||||
//
|
||||
// 2. Once we know the partitioning, for each partition,
|
||||
// decide if it's time to create a new image layer. The
|
||||
// criteria is: there has been too much "churn" since the last
|
||||
// image layer? The "churn" is a fuzzy concept, it's a
|
||||
// combination of too many delta files, or too much WAL in
|
||||
// total in the delta file. Or perhaps: if creating an image
|
||||
// file would allow to delete some older files.
|
||||
//
|
||||
// 3. After that, we compact all level0 delta files if there
|
||||
// are too many of them. While compacting, we also garbage
|
||||
// collect any page versions that are no longer needed because
|
||||
// of the new image layers we created in step 2.
|
||||
//
|
||||
// TODO: This high level strategy hasn't been implemented yet.
|
||||
// Below are functions compact_level0() and create_image_layers()
|
||||
// but they are a bit ad hoc and don't quite work like it's explained
|
||||
// above. Rewrite it.
|
||||
let _layer_removal_cs = self.layer_removal_cs.lock().unwrap();
|
||||
|
||||
let target_file_size = self.get_checkpoint_distance();
|
||||
|
||||
// Define partitioning schema if needed
|
||||
|
||||
match self.repartition(
|
||||
self.get_last_record_lsn(),
|
||||
self.get_compaction_target_size(),
|
||||
) {
|
||||
Ok((partitioning, lsn)) => {
|
||||
// 2. Create new image layers for partitions that have been modified
|
||||
// "enough".
|
||||
let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?;
|
||||
if !layer_paths_to_upload.is_empty()
|
||||
&& self.upload_layers.load(atomic::Ordering::Relaxed)
|
||||
{
|
||||
storage_sync::schedule_layer_upload(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
layer_paths_to_upload,
|
||||
None,
|
||||
);
|
||||
}
|
||||
|
||||
// 3. Compact
|
||||
let timer = self.metrics.compact_time_histo.start_timer();
|
||||
self.compact_level0(target_file_size)?;
|
||||
timer.stop_and_record();
|
||||
}
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
// as an empty timeline. Also in unit tests, when we use the timeline
|
||||
// as a simple key-value store, ignoring the datadir layout. Log the
|
||||
// error but continue.
|
||||
error!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||
}
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Mutate the timeline with a [`TimelineWriter`].
|
||||
pub fn writer(&self) -> TimelineWriter<'_> {
|
||||
TimelineWriter {
|
||||
@@ -486,6 +567,109 @@ impl Timeline {
|
||||
_write_guard: self.write_lock.lock().unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve current logical size of the timeline.
|
||||
///
|
||||
/// The size could be lagging behind the actual number, in case
|
||||
/// the initial size calculation has not been run (gets triggered on the first size access).
|
||||
pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<u64> {
|
||||
let current_size = self.current_logical_size.current_size()?;
|
||||
debug!("Current size: {current_size:?}");
|
||||
|
||||
let size = current_size.size();
|
||||
if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
|
||||
(current_size, self.current_logical_size.initial_part_end)
|
||||
{
|
||||
self.try_spawn_size_init_task(init_lsn);
|
||||
}
|
||||
|
||||
Ok(size)
|
||||
}
|
||||
|
||||
/// Check if more than 'checkpoint_distance' of WAL has been accumulated in
|
||||
/// the in-memory layer, and initiate flushing it if so.
|
||||
///
|
||||
/// Also flush after a period of time without new data -- it helps
|
||||
/// safekeepers to regard pageserver as caught up and suspend activity.
|
||||
pub fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
|
||||
let last_lsn = self.get_last_record_lsn();
|
||||
let layers = self.layers.read().unwrap();
|
||||
if let Some(open_layer) = &layers.open_layer {
|
||||
let open_layer_size = open_layer.size()?;
|
||||
drop(layers);
|
||||
let last_freeze_at = self.last_freeze_at.load();
|
||||
let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
|
||||
let distance = last_lsn.widening_sub(last_freeze_at);
|
||||
// Checkpointing the open layer can be triggered by layer size or LSN range.
|
||||
// S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
|
||||
// we want to stay below that with a big margin. The LSN distance determines how
|
||||
// much WAL the safekeepers need to store.
|
||||
if distance >= self.get_checkpoint_distance().into()
|
||||
|| open_layer_size > self.get_checkpoint_distance()
|
||||
|| (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
|
||||
{
|
||||
info!(
|
||||
"check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
|
||||
distance,
|
||||
open_layer_size,
|
||||
last_freeze_ts.elapsed()
|
||||
);
|
||||
|
||||
self.freeze_inmem_layer(true);
|
||||
self.last_freeze_at.store(last_lsn);
|
||||
*(self.last_freeze_ts.write().unwrap()) = Instant::now();
|
||||
|
||||
// Launch a task to flush the frozen layer to disk, unless
|
||||
// a task was already running. (If the task was running
|
||||
// at the time that we froze the layer, it must've seen the
|
||||
// layer we just froze before it exited; see comments
|
||||
// in flush_frozen_layers())
|
||||
if let Ok(guard) = self.layer_flush_lock.try_lock() {
|
||||
drop(guard);
|
||||
let self_clone = Arc::clone(self);
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::LayerFlushTask,
|
||||
Some(self.tenant_id),
|
||||
Some(self.timeline_id),
|
||||
"layer flush task",
|
||||
false,
|
||||
async move { self_clone.flush_frozen_layers(false) },
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn set_state(&self, new_state: TimelineState) {
|
||||
match (self.current_state(), new_state) {
|
||||
(equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
|
||||
debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
|
||||
}
|
||||
(TimelineState::Broken, _) => {
|
||||
error!("Ignoring state update {new_state:?} for broken tenant");
|
||||
}
|
||||
(TimelineState::Paused, TimelineState::Active) => {
|
||||
debug!("Not activating a paused timeline");
|
||||
}
|
||||
(_, new_state) => {
|
||||
self.state.send_replace(new_state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn current_state(&self) -> TimelineState {
|
||||
*self.state.borrow()
|
||||
}
|
||||
|
||||
pub fn is_active(&self) -> bool {
|
||||
self.current_state() == TimelineState::Active
|
||||
}
|
||||
|
||||
pub fn subscribe_for_state_updates(&self) -> watch::Receiver<TimelineState> {
|
||||
self.state.subscribe()
|
||||
}
|
||||
}
|
||||
|
||||
// Private functions
|
||||
@@ -529,7 +713,7 @@ impl Timeline {
|
||||
///
|
||||
/// Loads the metadata for the timeline into memory, but not the layer map.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new(
|
||||
pub(super) fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||
metadata: TimelineMetadata,
|
||||
@@ -539,8 +723,9 @@ impl Timeline {
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
upload_layers: bool,
|
||||
pg_version: u32,
|
||||
) -> Timeline {
|
||||
) -> Self {
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
let (state, _) = watch::channel(TimelineState::Suspended);
|
||||
|
||||
let mut result = Timeline {
|
||||
conf,
|
||||
@@ -597,12 +782,13 @@ impl Timeline {
|
||||
|
||||
last_received_wal: Mutex::new(None),
|
||||
rel_size_cache: RwLock::new(HashMap::new()),
|
||||
state,
|
||||
};
|
||||
result.repartition_threshold = result.get_checkpoint_distance() / 10;
|
||||
result
|
||||
}
|
||||
|
||||
pub fn launch_wal_receiver(self: &Arc<Self>) {
|
||||
pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
|
||||
if !is_etcd_client_initialized() {
|
||||
if cfg!(test) {
|
||||
info!("not launching WAL receiver because etcd client hasn't been initialized");
|
||||
@@ -641,7 +827,7 @@ impl Timeline {
|
||||
/// Scan the timeline directory to populate the layer map.
|
||||
/// Returns all timeline-related files that were found and loaded.
|
||||
///
|
||||
pub fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
|
||||
pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let mut num_layers = 0;
|
||||
|
||||
@@ -727,33 +913,13 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn layer_removal_guard(&self) -> anyhow::Result<MutexGuard<()>> {
|
||||
pub(super) fn layer_removal_guard(&self) -> anyhow::Result<MutexGuard<()>> {
|
||||
self.layer_removal_cs
|
||||
.try_lock()
|
||||
.map_err(|e| anyhow!("cannot lock compaction critical section {e}"))
|
||||
}
|
||||
|
||||
/// Retrieve current logical size of the timeline.
|
||||
///
|
||||
/// The size could be lagging behind the actual number, in case
|
||||
/// the initial size calculation has not been run (gets triggered on the first size access).
|
||||
pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<u64> {
|
||||
let current_size = self.current_logical_size.current_size()?;
|
||||
debug!("Current size: {current_size:?}");
|
||||
|
||||
let size = current_size.size();
|
||||
if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
|
||||
(current_size, self.current_logical_size.initial_part_end)
|
||||
{
|
||||
self.try_spawn_size_init_task(init_lsn);
|
||||
}
|
||||
|
||||
Ok(size)
|
||||
}
|
||||
|
||||
fn try_spawn_size_init_task(self: &Arc<Self>, init_lsn: Lsn) {
|
||||
let timeline_id = self.timeline_id;
|
||||
|
||||
// Atomically check if the timeline size calculation had already started.
|
||||
// If the flag was not already set, this sets it.
|
||||
if !self
|
||||
@@ -770,17 +936,42 @@ impl Timeline {
|
||||
"initial size calculation",
|
||||
false,
|
||||
async move {
|
||||
let calculated_size = self_clone.calculate_logical_size(init_lsn)?;
|
||||
let result = spawn_blocking(move || {
|
||||
self_clone.current_logical_size.initial_logical_size.set(calculated_size)
|
||||
}).await?;
|
||||
match result {
|
||||
Ok(()) => info!("Successfully calculated initial logical size"),
|
||||
Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"),
|
||||
let mut timeline_state_updates = self_clone.subscribe_for_state_updates();
|
||||
let self_calculation = Arc::clone(&self_clone);
|
||||
tokio::select! {
|
||||
calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => {
|
||||
let calculated_size = calculation_result
|
||||
.context("Failed to spawn calculation result task")?
|
||||
.context("Failed to calculate logical size")?;
|
||||
match self_clone.current_logical_size.initial_logical_size.set(calculated_size) {
|
||||
Ok(()) => info!("Successfully calculated initial logical size"),
|
||||
Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"),
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
new_event = async {
|
||||
loop {
|
||||
match timeline_state_updates.changed().await {
|
||||
Ok(()) => {
|
||||
let new_state = *timeline_state_updates.borrow();
|
||||
match new_state {
|
||||
// we're running this job for active timelines only
|
||||
TimelineState::Active => continue,
|
||||
TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return Some(new_state),
|
||||
}
|
||||
}
|
||||
Err(_sender_dropped_error) => return None,
|
||||
}
|
||||
}
|
||||
} => {
|
||||
match new_event {
|
||||
Some(new_state) => info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"),
|
||||
None => info!("Timeline dropped state updates sender, stopping init size calculation"),
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
.instrument(info_span!("initial_logical_size_calculation", timeline = %timeline_id))
|
||||
}.instrument(info_span!("initial_logical_size_calculation", tenant = %self.tenant_id, timeline = %self.timeline_id)),
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -971,7 +1162,7 @@ impl Timeline {
|
||||
Some((lsn, img))
|
||||
}
|
||||
|
||||
fn get_ancestor_timeline(&self) -> Result<Arc<Timeline>> {
|
||||
fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
|
||||
let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
|
||||
format!(
|
||||
"Ancestor is missing. Timeline id: {} Ancestor id {:?}",
|
||||
@@ -1030,14 +1221,14 @@ impl Timeline {
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
|
||||
fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
|
||||
//info!("PUT: key {} at {}", key, lsn);
|
||||
let layer = self.get_layer_for_write(lsn)?;
|
||||
layer.put_value(key, lsn, val)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()> {
|
||||
fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
|
||||
let layer = self.get_layer_for_write(lsn)?;
|
||||
layer.put_tombstone(key_range, lsn)?;
|
||||
|
||||
@@ -1076,64 +1267,6 @@ impl Timeline {
|
||||
drop(layers);
|
||||
}
|
||||
|
||||
///
|
||||
/// Check if more than 'checkpoint_distance' of WAL has been accumulated in
|
||||
/// the in-memory layer, and initiate flushing it if so.
|
||||
///
|
||||
/// Also flush after a period of time without new data -- it helps
|
||||
/// safekeepers to regard pageserver as caught up and suspend activity.
|
||||
///
|
||||
pub fn check_checkpoint_distance(self: &Arc<Timeline>) -> Result<()> {
|
||||
let last_lsn = self.get_last_record_lsn();
|
||||
let layers = self.layers.read().unwrap();
|
||||
if let Some(open_layer) = &layers.open_layer {
|
||||
let open_layer_size = open_layer.size()?;
|
||||
drop(layers);
|
||||
let last_freeze_at = self.last_freeze_at.load();
|
||||
let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
|
||||
let distance = last_lsn.widening_sub(last_freeze_at);
|
||||
// Checkpointing the open layer can be triggered by layer size or LSN range.
|
||||
// S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
|
||||
// we want to stay below that with a big margin. The LSN distance determines how
|
||||
// much WAL the safekeepers need to store.
|
||||
if distance >= self.get_checkpoint_distance().into()
|
||||
|| open_layer_size > self.get_checkpoint_distance()
|
||||
|| (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
|
||||
{
|
||||
info!(
|
||||
"check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
|
||||
distance,
|
||||
open_layer_size,
|
||||
last_freeze_ts.elapsed()
|
||||
);
|
||||
|
||||
self.freeze_inmem_layer(true);
|
||||
self.last_freeze_at.store(last_lsn);
|
||||
*(self.last_freeze_ts.write().unwrap()) = Instant::now();
|
||||
|
||||
// Launch a task to flush the frozen layer to disk, unless
|
||||
// a task was already running. (If the task was running
|
||||
// at the time that we froze the layer, it must've seen the
|
||||
// the layer we just froze before it exited; see comments
|
||||
// in flush_frozen_layers())
|
||||
if let Ok(guard) = self.layer_flush_lock.try_lock() {
|
||||
drop(guard);
|
||||
let self_clone = Arc::clone(self);
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::LayerFlushTask,
|
||||
Some(self.tenant_id),
|
||||
Some(self.timeline_id),
|
||||
"layer flush task",
|
||||
false,
|
||||
async move { self_clone.flush_frozen_layers(false) },
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Flush all frozen layers to disk.
|
||||
///
|
||||
/// Only one task at a time can be doing layer-flushing for a
|
||||
@@ -1141,7 +1274,7 @@ impl Timeline {
|
||||
/// currently doing the flushing, this function will wait for it
|
||||
/// to finish. If 'wait' is false, this function will return
|
||||
/// immediately instead.
|
||||
fn flush_frozen_layers(&self, wait: bool) -> Result<()> {
|
||||
fn flush_frozen_layers(&self, wait: bool) -> anyhow::Result<()> {
|
||||
let flush_lock_guard = if wait {
|
||||
self.layer_flush_lock.lock().unwrap()
|
||||
} else {
|
||||
@@ -1180,7 +1313,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Flush one frozen in-memory layer to disk, as a new delta layer.
|
||||
fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> Result<()> {
|
||||
fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
|
||||
// As a special case, when we have just imported an image into the repository,
|
||||
// instead of writing out a L0 delta layer, we directly write out image layer
|
||||
// files instead. This is possible as long as *all* the data imported into the
|
||||
@@ -1238,7 +1371,7 @@ impl Timeline {
|
||||
&self,
|
||||
disk_consistent_lsn: Lsn,
|
||||
layer_paths_to_upload: HashMap<PathBuf, LayerFileMetadata>,
|
||||
) -> Result<()> {
|
||||
) -> anyhow::Result<()> {
|
||||
// We can only save a valid 'prev_record_lsn' value on disk if we
|
||||
// flushed *all* in-memory changes to disk. We only track
|
||||
// 'prev_record_lsn' in memory for the latest processed record, so we
|
||||
@@ -1283,7 +1416,7 @@ impl Timeline {
|
||||
false,
|
||||
)?;
|
||||
|
||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
||||
if self.can_upload_layers() {
|
||||
storage_sync::schedule_layer_upload(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
@@ -1299,7 +1432,7 @@ impl Timeline {
|
||||
fn create_delta_layer(
|
||||
&self,
|
||||
frozen_layer: &InMemoryLayer,
|
||||
) -> Result<(PathBuf, LayerFileMetadata)> {
|
||||
) -> anyhow::Result<(PathBuf, LayerFileMetadata)> {
|
||||
// Write it out
|
||||
let new_delta = frozen_layer.write_to_disk()?;
|
||||
let new_delta_path = new_delta.path();
|
||||
@@ -1334,92 +1467,7 @@ impl Timeline {
|
||||
Ok((new_delta_path, LayerFileMetadata::new(sz)))
|
||||
}
|
||||
|
||||
pub fn compact(&self) -> anyhow::Result<()> {
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
|
||||
// Last record Lsn could be zero in case the timelie was just created
|
||||
if !last_record_lsn.is_valid() {
|
||||
warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
//
|
||||
// High level strategy for compaction / image creation:
|
||||
//
|
||||
// 1. First, calculate the desired "partitioning" of the
|
||||
// currently in-use key space. The goal is to partition the
|
||||
// key space into roughly fixed-size chunks, but also take into
|
||||
// account any existing image layers, and try to align the
|
||||
// chunk boundaries with the existing image layers to avoid
|
||||
// too much churn. Also try to align chunk boundaries with
|
||||
// relation boundaries. In principle, we don't know about
|
||||
// relation boundaries here, we just deal with key-value
|
||||
// pairs, and the code in pgdatadir_mapping.rs knows how to
|
||||
// map relations into key-value pairs. But in practice we know
|
||||
// that 'field6' is the block number, and the fields 1-5
|
||||
// identify a relation. This is just an optimization,
|
||||
// though.
|
||||
//
|
||||
// 2. Once we know the partitioning, for each partition,
|
||||
// decide if it's time to create a new image layer. The
|
||||
// criteria is: there has been too much "churn" since the last
|
||||
// image layer? The "churn" is fuzzy concept, it's a
|
||||
// combination of too many delta files, or too much WAL in
|
||||
// total in the delta file. Or perhaps: if creating an image
|
||||
// file would allow to delete some older files.
|
||||
//
|
||||
// 3. After that, we compact all level0 delta files if there
|
||||
// are too many of them. While compacting, we also garbage
|
||||
// collect any page versions that are no longer needed because
|
||||
// of the new image layers we created in step 2.
|
||||
//
|
||||
// TODO: This high level strategy hasn't been implemented yet.
|
||||
// Below are functions compact_level0() and create_image_layers()
|
||||
// but they are a bit ad hoc and don't quite work like it's explained
|
||||
// above. Rewrite it.
|
||||
let _layer_removal_cs = self.layer_removal_cs.lock().unwrap();
|
||||
|
||||
let target_file_size = self.get_checkpoint_distance();
|
||||
|
||||
// Define partitioning schema if needed
|
||||
|
||||
match self.repartition(
|
||||
self.get_last_record_lsn(),
|
||||
self.get_compaction_target_size(),
|
||||
) {
|
||||
Ok((partitioning, lsn)) => {
|
||||
// 2. Create new image layers for partitions that have been modified
|
||||
// "enough".
|
||||
let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?;
|
||||
if !layer_paths_to_upload.is_empty()
|
||||
&& self.upload_layers.load(atomic::Ordering::Relaxed)
|
||||
{
|
||||
storage_sync::schedule_layer_upload(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
layer_paths_to_upload,
|
||||
None,
|
||||
);
|
||||
}
|
||||
|
||||
// 3. Compact
|
||||
let timer = self.metrics.compact_time_histo.start_timer();
|
||||
self.compact_level0(target_file_size)?;
|
||||
timer.stop_and_record();
|
||||
}
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
// as an empty timeline. Also in unit tests, when we use the timeline
|
||||
// as a simple key-value store, ignoring the datadir layout. Log the
|
||||
// error but continue.
|
||||
error!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||
}
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn repartition(&self, lsn: Lsn, partition_size: u64) -> Result<(KeyPartitioning, Lsn)> {
|
||||
fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> {
|
||||
let mut partitioning_guard = self.partitioning.lock().unwrap();
|
||||
if partitioning_guard.1 == Lsn(0)
|
||||
|| lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold
|
||||
@@ -1433,7 +1481,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// Is it time to create a new image layer for the given partition?
|
||||
fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result<bool> {
|
||||
fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<bool> {
|
||||
let layers = self.layers.read().unwrap();
|
||||
|
||||
for part_range in &partition.ranges {
|
||||
@@ -1478,7 +1526,7 @@ impl Timeline {
|
||||
partitioning: &KeyPartitioning,
|
||||
lsn: Lsn,
|
||||
force: bool,
|
||||
) -> Result<HashMap<PathBuf, LayerFileMetadata>> {
|
||||
) -> anyhow::Result<HashMap<PathBuf, LayerFileMetadata>> {
|
||||
let timer = self.metrics.create_images_time_histo.start_timer();
|
||||
let mut image_layers: Vec<ImageLayer> = Vec::new();
|
||||
for partition in partitioning.parts.iter() {
|
||||
@@ -1493,6 +1541,10 @@ impl Timeline {
|
||||
lsn,
|
||||
)?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
anyhow::bail!("failpoint image-layer-writer-fail-before-finish");
|
||||
});
|
||||
|
||||
for range in &partition.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
@@ -1571,7 +1623,7 @@ impl Timeline {
|
||||
/// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
|
||||
/// Level 1 files.
|
||||
///
|
||||
fn compact_level0(&self, target_file_size: u64) -> Result<()> {
|
||||
fn compact_level0(&self, target_file_size: u64) -> anyhow::Result<()> {
|
||||
let layers = self.layers.read().unwrap();
|
||||
let mut level0_deltas = layers.get_level0_deltas()?;
|
||||
drop(layers);
|
||||
@@ -1787,6 +1839,11 @@ impl Timeline {
|
||||
},
|
||||
)?);
|
||||
}
|
||||
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
anyhow::bail!("failpoint delta-layer-writer-fail-before-finish");
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||
prev_key = Some(key);
|
||||
}
|
||||
@@ -1838,7 +1895,7 @@ impl Timeline {
|
||||
}
|
||||
drop(layers);
|
||||
|
||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
||||
if self.can_upload_layers() {
|
||||
storage_sync::schedule_layer_upload(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
@@ -1881,12 +1938,12 @@ impl Timeline {
|
||||
///
|
||||
/// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
|
||||
/// whether a record is needed for PITR.
|
||||
pub fn update_gc_info(
|
||||
pub(super) fn update_gc_info(
|
||||
&self,
|
||||
retain_lsns: Vec<Lsn>,
|
||||
cutoff_horizon: Lsn,
|
||||
pitr: Duration,
|
||||
) -> Result<()> {
|
||||
) -> anyhow::Result<()> {
|
||||
let mut gc_info = self.gc_info.write().unwrap();
|
||||
|
||||
gc_info.horizon_cutoff = cutoff_horizon;
|
||||
@@ -1941,8 +1998,8 @@ impl Timeline {
|
||||
/// within a layer file. We can only remove the whole file if it's fully
|
||||
/// obsolete.
|
||||
///
|
||||
pub fn gc(&self) -> Result<GcResult> {
|
||||
let mut result: GcResult = Default::default();
|
||||
pub(super) fn gc(&self) -> anyhow::Result<GcResult> {
|
||||
let mut result: GcResult = GcResult::default();
|
||||
let now = SystemTime::now();
|
||||
|
||||
fail_point!("before-timeline-gc");
|
||||
@@ -2122,7 +2179,7 @@ impl Timeline {
|
||||
fail_point!("after-timeline-gc-removed-layers");
|
||||
}
|
||||
|
||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
||||
if self.can_upload_layers() {
|
||||
storage_sync::schedule_layer_delete(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
@@ -2211,6 +2268,11 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn can_upload_layers(&self) -> bool {
|
||||
self.upload_layers.load(atomic::Ordering::Relaxed)
|
||||
&& self.current_state() != TimelineState::Broken
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function for get_reconstruct_data() to add the path of layers traversed
|
||||
@@ -2261,11 +2323,11 @@ impl<'a> TimelineWriter<'a> {
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> {
|
||||
pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> {
|
||||
self.tl.put_value(key, lsn, value)
|
||||
}
|
||||
|
||||
pub fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()> {
|
||||
pub fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
|
||||
self.tl.put_tombstone(key_range, lsn)
|
||||
}
|
||||
|
||||
|
||||
@@ -175,7 +175,7 @@ async fn wait_for_active_tenant(
|
||||
}
|
||||
state => {
|
||||
debug!("Not running the task loop, tenant is not active with background jobs enabled: {state:?}");
|
||||
tokio::time::sleep(wait).await;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -319,6 +319,12 @@ impl VirtualFile {
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn remove(self) {
|
||||
let path = self.path.clone();
|
||||
drop(self);
|
||||
std::fs::remove_file(path).expect("failed to remove the virtual file");
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for VirtualFile {
|
||||
|
||||
@@ -31,10 +31,10 @@ use bytes::{Buf, Bytes, BytesMut};
|
||||
use tracing::*;
|
||||
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walrecord::*;
|
||||
use crate::ZERO_PAGE;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
|
||||
|
||||
@@ -155,22 +155,19 @@ impl<E: Clone> TaskHandle<E> {
|
||||
|
||||
/// Aborts current task, waiting for it to finish.
|
||||
pub async fn shutdown(self) {
|
||||
match self.join_handle {
|
||||
Some(jh) => {
|
||||
self.cancellation.send(()).ok();
|
||||
match jh.await {
|
||||
Ok(Ok(())) => debug!("Shutdown success"),
|
||||
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
|
||||
Err(join_error) => {
|
||||
if join_error.is_cancelled() {
|
||||
error!("Shutdown task was cancelled");
|
||||
} else {
|
||||
error!("Shutdown task join error: {join_error}")
|
||||
}
|
||||
if let Some(jh) = self.join_handle {
|
||||
self.cancellation.send(()).ok();
|
||||
match jh.await {
|
||||
Ok(Ok(())) => debug!("Shutdown success"),
|
||||
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
|
||||
Err(join_error) => {
|
||||
if join_error.is_cancelled() {
|
||||
error!("Shutdown task was cancelled");
|
||||
} else {
|
||||
error!("Shutdown task join error: {join_error}")
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
use std::{
|
||||
collections::{hash_map, HashMap},
|
||||
num::NonZeroU64,
|
||||
ops::ControlFlow,
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
@@ -26,7 +27,8 @@ use etcd_broker::{
|
||||
subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription,
|
||||
BrokerUpdate, Client,
|
||||
};
|
||||
use tokio::select;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use tokio::{select, sync::watch};
|
||||
use tracing::*;
|
||||
|
||||
use crate::{
|
||||
@@ -58,10 +60,7 @@ pub fn spawn_connection_manager_task(
|
||||
TaskKind::WalReceiverManager,
|
||||
Some(tenant_id),
|
||||
Some(timeline_id),
|
||||
&format!(
|
||||
"walreceiver for tenant {} timeline {}",
|
||||
timeline.tenant_id, timeline.timeline_id
|
||||
),
|
||||
&format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
|
||||
false,
|
||||
async move {
|
||||
info!("WAL receiver broker started, connecting to etcd");
|
||||
@@ -75,24 +74,26 @@ pub fn spawn_connection_manager_task(
|
||||
select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
info!("WAL receiver shutdown requested, shutting down");
|
||||
// Kill current connection, if any
|
||||
if let Some(wal_connection) = walreceiver_state.wal_connection.take()
|
||||
{
|
||||
wal_connection.connection_task.shutdown().await;
|
||||
}
|
||||
walreceiver_state.shutdown().await;
|
||||
return Ok(());
|
||||
},
|
||||
|
||||
_ = connection_manager_loop_step(
|
||||
loop_step_result = connection_manager_loop_step(
|
||||
&broker_loop_prefix,
|
||||
&mut etcd_client,
|
||||
&mut walreceiver_state,
|
||||
) => {},
|
||||
) => match loop_step_result {
|
||||
ControlFlow::Continue(()) => continue,
|
||||
ControlFlow::Break(()) => {
|
||||
info!("Connection manager loop ended, shutting down");
|
||||
walreceiver_state.shutdown().await;
|
||||
return Ok(());
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
.instrument(
|
||||
info_span!("wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
|
||||
info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
|
||||
),
|
||||
);
|
||||
}
|
||||
@@ -104,7 +105,17 @@ async fn connection_manager_loop_step(
|
||||
broker_prefix: &str,
|
||||
etcd_client: &mut Client,
|
||||
walreceiver_state: &mut WalreceiverState,
|
||||
) {
|
||||
) -> ControlFlow<(), ()> {
|
||||
let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates();
|
||||
|
||||
match wait_for_active_timeline(&mut timeline_state_updates).await {
|
||||
ControlFlow::Continue(()) => {}
|
||||
ControlFlow::Break(()) => {
|
||||
info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop");
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
}
|
||||
|
||||
let id = TenantTimelineId {
|
||||
tenant_id: walreceiver_state.timeline.tenant_id,
|
||||
timeline_id: walreceiver_state.timeline.timeline_id,
|
||||
@@ -129,10 +140,12 @@ async fn connection_manager_loop_step(
|
||||
// - change connection if the rules decide so, or if the current connection dies
|
||||
// - receive updates from broker
|
||||
// - this might change the current desired connection
|
||||
// - timeline state changes to something that does not allow walreceiver to run concurrently
|
||||
select! {
|
||||
broker_connection_result = &mut broker_subscription.watcher_handle => {
|
||||
info!("Broker connection was closed from the other side, ending current broker loop step");
|
||||
cleanup_broker_connection(broker_connection_result, walreceiver_state);
|
||||
return;
|
||||
return ControlFlow::Continue(());
|
||||
},
|
||||
|
||||
Some(wal_connection_update) = async {
|
||||
@@ -185,11 +198,36 @@ async fn connection_manager_loop_step(
|
||||
(&mut broker_subscription.watcher_handle).await,
|
||||
walreceiver_state,
|
||||
);
|
||||
return;
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
new_event = async {
|
||||
loop {
|
||||
match timeline_state_updates.changed().await {
|
||||
Ok(()) => {
|
||||
let new_state = walreceiver_state.timeline.current_state();
|
||||
match new_state {
|
||||
// we're already active as walreceiver, no need to reactivate
|
||||
TimelineState::Active => continue,
|
||||
TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return ControlFlow::Continue(new_state),
|
||||
}
|
||||
}
|
||||
Err(_sender_dropped_error) => return ControlFlow::Break(()),
|
||||
}
|
||||
}
|
||||
} => match new_event {
|
||||
ControlFlow::Continue(new_state) => {
|
||||
info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates");
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
ControlFlow::Break(()) => {
|
||||
info!("Timeline dropped state updates sender, stopping wal connection manager loop");
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
},
|
||||
|
||||
_ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {}
|
||||
}
|
||||
|
||||
@@ -216,6 +254,34 @@ async fn connection_manager_loop_step(
|
||||
}
|
||||
}
|
||||
|
||||
async fn wait_for_active_timeline(
|
||||
timeline_state_updates: &mut watch::Receiver<TimelineState>,
|
||||
) -> ControlFlow<(), ()> {
|
||||
let current_state = *timeline_state_updates.borrow();
|
||||
if current_state == TimelineState::Active {
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
|
||||
loop {
|
||||
match timeline_state_updates.changed().await {
|
||||
Ok(()) => {
|
||||
let new_state = *timeline_state_updates.borrow();
|
||||
match new_state {
|
||||
TimelineState::Active => {
|
||||
debug!("Timeline state changed to active, continuing the walreceiver connection manager");
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
state => {
|
||||
debug!("Not running the walreceiver connection manager, timeline is not active: {state:?}");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_sender_dropped_error) => return ControlFlow::Break(()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn cleanup_broker_connection(
|
||||
broker_connection_result: Result<Result<(), etcd_broker::BrokerError>, tokio::task::JoinError>,
|
||||
walreceiver_state: &mut WalreceiverState,
|
||||
@@ -723,6 +789,12 @@ impl WalreceiverState {
|
||||
self.wal_connection_retries.remove(&node_id);
|
||||
}
|
||||
}
|
||||
|
||||
async fn shutdown(mut self) {
|
||||
if let Some(wal_connection) = self.wal_connection.take() {
|
||||
wal_connection.connection_task.shutdown().await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
@@ -764,15 +836,20 @@ fn wal_stream_connection_string(
|
||||
listen_pg_addr_str: &str,
|
||||
) -> anyhow::Result<String> {
|
||||
let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db");
|
||||
let me_conf = sk_connstr
|
||||
.parse::<postgres::config::Config>()
|
||||
.with_context(|| {
|
||||
format!("Failed to parse pageserver connection string '{sk_connstr}' as a postgres one")
|
||||
})?;
|
||||
let (host, port) = utils::connstring::connection_host_port(&me_conf);
|
||||
Ok(format!(
|
||||
"host={host} port={port} options='-c timeline_id={timeline_id} tenant_id={tenant_id}'"
|
||||
))
|
||||
sk_connstr
|
||||
.parse()
|
||||
.context("bad url")
|
||||
.and_then(|url: url::Url| {
|
||||
let host = url.host_str().context("host is missing")?;
|
||||
let port = url.port().unwrap_or(5432); // default PG port
|
||||
|
||||
Ok(format!(
|
||||
"host={host} \
|
||||
port={port} \
|
||||
options='-c timeline_id={timeline_id} tenant_id={tenant_id}'"
|
||||
))
|
||||
})
|
||||
.with_context(|| format!("Failed to parse pageserver connection URL '{sk_connstr}'"))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -801,6 +878,7 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
safekeeper_connstr: None,
|
||||
},
|
||||
etcd_version: 0,
|
||||
@@ -817,7 +895,9 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
safekeeper_connstr: Some("no commit_lsn".to_string()),
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("no_commit_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
@@ -833,7 +913,8 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
safekeeper_connstr: Some("no commit_lsn".to_string()),
|
||||
local_start_lsn: None,
|
||||
safekeeper_connstr: Some("no_commit_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
@@ -849,6 +930,7 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
safekeeper_connstr: None,
|
||||
},
|
||||
etcd_version: 0,
|
||||
@@ -908,6 +990,8 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
@@ -924,7 +1008,9 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
safekeeper_connstr: Some("not advanced Lsn".to_string()),
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("not_advanced_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
@@ -940,7 +1026,9 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
safekeeper_connstr: Some("not enough advanced Lsn".to_string()),
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("not_enough_advanced_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
@@ -974,6 +1062,8 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
@@ -1006,7 +1096,9 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
safekeeper_connstr: Some("smaller commit_lsn".to_string()),
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("smaller_commit_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
@@ -1022,6 +1114,8 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
@@ -1038,6 +1132,8 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: None,
|
||||
},
|
||||
etcd_version: 0,
|
||||
@@ -1083,6 +1179,8 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
@@ -1099,6 +1197,8 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
@@ -1168,6 +1268,8 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
@@ -1184,7 +1286,9 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()),
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("advanced_by_lsn_safekeeper".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
@@ -1208,7 +1312,7 @@ mod tests {
|
||||
);
|
||||
assert!(over_threshcurrent_candidate
|
||||
.wal_source_connstr
|
||||
.contains("advanced by Lsn safekeeper"));
|
||||
.contains("advanced_by_lsn_safekeeper"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1255,6 +1359,8 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
@@ -1326,6 +1432,8 @@ mod tests {
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
//! process. Then we get the page image back. Communication with the
|
||||
//! postgres process happens via stdin/stdout
|
||||
//!
|
||||
//! See src/backend/tcop/zenith_wal_redo.c for the other side of
|
||||
//! See pgxn/neon_walredo/walredoproc.c for the other side of
|
||||
//! this communication.
|
||||
//!
|
||||
//! The Postgres process is assumed to be secure against malicious WAL
|
||||
@@ -43,10 +43,10 @@ use crate::metrics::{
|
||||
WAL_REDO_WAIT_TIME,
|
||||
};
|
||||
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::repository::Key;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{config::PageServerConf, TEMP_FILE_SUFFIX};
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
|
||||
use postgres_ffi::v14::nonrelfile_utils::{
|
||||
@@ -644,14 +644,12 @@ impl PostgresRedoProcess {
|
||||
),
|
||||
));
|
||||
} else {
|
||||
// Limit shared cache for wal-redo-postres
|
||||
// Limit shared cache for wal-redo-postgres
|
||||
let mut config = OpenOptions::new()
|
||||
.append(true)
|
||||
.open(PathBuf::from(&datadir).join("postgresql.conf"))?;
|
||||
config.write_all(b"shared_buffers=128kB\n")?;
|
||||
config.write_all(b"fsync=off\n")?;
|
||||
config.write_all(b"shared_preload_libraries=neon\n")?;
|
||||
config.write_all(b"neon.wal_redo=on\n")?;
|
||||
}
|
||||
|
||||
// Start postgres itself
|
||||
@@ -664,18 +662,15 @@ impl PostgresRedoProcess {
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("PGDATA", &datadir)
|
||||
// The redo process is not trusted, so it runs in seccomp mode
|
||||
// (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't
|
||||
// inherit any file descriptors from the pageserver that would allow
|
||||
// an attacker to do bad things.
|
||||
// The redo process is not trusted, and runs in seccomp mode that
|
||||
// doesn't allow it to open any files. We have to also make sure it
|
||||
// doesn't inherit any file descriptors from the pageserver, that
|
||||
// would allow an attacker to read any files that happen to be open
|
||||
// in the pageserver.
|
||||
//
|
||||
// The Rust standard library makes sure to mark any file descriptors with
|
||||
// as close-on-exec by default, but that's not enough, since we use
|
||||
// libraries that directly call libc open without setting that flag.
|
||||
//
|
||||
// One example is the pidfile of the daemonize library, which doesn't
|
||||
// currently mark file descriptors as close-on-exec. Either way, we
|
||||
// want to be on the safe side and prevent accidental regression.
|
||||
.close_fds()
|
||||
.spawn()
|
||||
.map_err(|e| {
|
||||
@@ -844,7 +839,7 @@ impl PostgresRedoProcess {
|
||||
}
|
||||
|
||||
// Functions for constructing messages to send to the postgres WAL redo
|
||||
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
|
||||
// process. See pgxn/neon_walredo/walredoproc.c for
|
||||
// explanation of the protocol.
|
||||
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
MODULE_big = neon
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
inmem_smgr.o \
|
||||
libpagestore.o \
|
||||
libpqwalproposer.o \
|
||||
pagestore_smgr.o \
|
||||
|
||||
@@ -419,15 +419,6 @@ pg_init_libpagestore(void)
|
||||
0, /* no flags required */
|
||||
check_neon_id, NULL, NULL);
|
||||
|
||||
DefineCustomBoolVariable("neon.wal_redo",
|
||||
"start in wal-redo mode",
|
||||
NULL,
|
||||
&wal_redo,
|
||||
false,
|
||||
PGC_POSTMASTER,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomIntVariable("neon.max_cluster_size",
|
||||
"cluster size limit",
|
||||
NULL,
|
||||
@@ -452,13 +443,7 @@ pg_init_libpagestore(void)
|
||||
neon_timeline_walproposer = neon_timeline;
|
||||
neon_tenant_walproposer = neon_tenant;
|
||||
|
||||
if (wal_redo)
|
||||
{
|
||||
neon_log(PageStoreTrace, "set inmem_smgr hook");
|
||||
smgr_hook = smgr_inmem;
|
||||
smgr_init_hook = smgr_init_inmem;
|
||||
}
|
||||
else if (page_server_connstring && page_server_connstring[0])
|
||||
if (page_server_connstring && page_server_connstring[0])
|
||||
{
|
||||
neon_log(PageStoreTrace, "set neon_smgr hook");
|
||||
smgr_hook = smgr_neon;
|
||||
|
||||
@@ -155,10 +155,6 @@ extern int32 max_cluster_size;
|
||||
extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode);
|
||||
extern void smgr_init_neon(void);
|
||||
|
||||
extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
|
||||
extern void smgr_init_inmem(void);
|
||||
extern void smgr_shutdown_inmem(void);
|
||||
|
||||
/* Neon storage manager functionality */
|
||||
|
||||
extern void neon_init(void);
|
||||
@@ -188,29 +184,6 @@ extern void neon_truncate(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber nblocks);
|
||||
extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum);
|
||||
|
||||
/* neon wal-redo storage manager functionality */
|
||||
|
||||
extern void inmem_init(void);
|
||||
extern void inmem_open(SMgrRelation reln);
|
||||
extern void inmem_close(SMgrRelation reln, ForkNumber forknum);
|
||||
extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
|
||||
extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
|
||||
extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
|
||||
extern void inmem_extend(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, char *buffer, bool skipFsync);
|
||||
extern bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum);
|
||||
extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
char *buffer);
|
||||
extern void inmem_write(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, char *buffer, bool skipFsync);
|
||||
extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, BlockNumber nblocks);
|
||||
extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
|
||||
extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber nblocks);
|
||||
extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
|
||||
|
||||
/* utils for neon relsize cache */
|
||||
extern void relsize_hash_init(void);
|
||||
extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size);
|
||||
|
||||
@@ -99,7 +99,6 @@ char *page_server_connstring;
|
||||
/*with substituted password*/
|
||||
char *neon_timeline;
|
||||
char *neon_tenant;
|
||||
bool wal_redo = false;
|
||||
int32 max_cluster_size;
|
||||
|
||||
/* unlogged relation build states */
|
||||
|
||||
@@ -43,6 +43,7 @@
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
#include "access/xlogrecovery.h"
|
||||
#endif
|
||||
#include "storage/fd.h"
|
||||
#include "storage/latch.h"
|
||||
#include "miscadmin.h"
|
||||
#include "pgstat.h"
|
||||
@@ -69,7 +70,8 @@
|
||||
#include "neon.h"
|
||||
#include "walproposer.h"
|
||||
#include "walproposer_utils.h"
|
||||
#include "replication/walpropshim.h"
|
||||
|
||||
static bool syncSafekeepers = false;
|
||||
|
||||
char *wal_acceptors_list;
|
||||
int wal_acceptor_reconnect_timeout;
|
||||
@@ -117,8 +119,8 @@ static TimestampTz last_reconnect_attempt;
|
||||
static WalproposerShmemState * walprop_shared;
|
||||
|
||||
/* Prototypes for private functions */
|
||||
static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId);
|
||||
static void WalProposerStartImpl(void);
|
||||
static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId);
|
||||
static void WalProposerStart(void);
|
||||
static void WalProposerLoop(void);
|
||||
static void InitEventSet(void);
|
||||
static void UpdateEventSet(Safekeeper *sk, uint32 events);
|
||||
@@ -186,9 +188,56 @@ pg_init_walproposer(void)
|
||||
ProcessInterruptsCallback = backpressure_throttling_impl;
|
||||
|
||||
WalProposerRegister();
|
||||
}
|
||||
|
||||
WalProposerInit = &WalProposerInitImpl;
|
||||
WalProposerStart = &WalProposerStartImpl;
|
||||
/*
|
||||
* Entry point for `postgres --sync-safekeepers`.
|
||||
*/
|
||||
void
|
||||
WalProposerSync(int argc, char *argv[])
|
||||
{
|
||||
struct stat stat_buf;
|
||||
|
||||
syncSafekeepers = true;
|
||||
#if PG_VERSION_NUM < 150000
|
||||
ThisTimeLineID = 1;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Initialize postmaster_alive_fds as WaitEventSet checks them.
|
||||
*
|
||||
* Copied from InitPostmasterDeathWatchHandle()
|
||||
*/
|
||||
if (pipe(postmaster_alive_fds) < 0)
|
||||
ereport(FATAL,
|
||||
(errcode_for_file_access(),
|
||||
errmsg_internal("could not create pipe to monitor postmaster death: %m")));
|
||||
if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1)
|
||||
ereport(FATAL,
|
||||
(errcode_for_socket_access(),
|
||||
errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m")));
|
||||
|
||||
ChangeToDataDir();
|
||||
|
||||
/* Create pg_wal directory, if it doesn't exist */
|
||||
if (stat(XLOGDIR, &stat_buf) != 0)
|
||||
{
|
||||
ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR)));
|
||||
if (MakePGDirectory(XLOGDIR) < 0)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not create directory \"%s\": %m",
|
||||
XLOGDIR)));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
WalProposerInit(0, 0);
|
||||
|
||||
BackgroundWorkerUnblockSignals();
|
||||
|
||||
WalProposerStart();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -429,7 +478,7 @@ WalProposerRegister(void)
|
||||
}
|
||||
|
||||
static void
|
||||
WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId)
|
||||
WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
|
||||
{
|
||||
char *host;
|
||||
char *sep;
|
||||
@@ -508,7 +557,7 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId)
|
||||
}
|
||||
|
||||
static void
|
||||
WalProposerStartImpl(void)
|
||||
WalProposerStart(void)
|
||||
{
|
||||
|
||||
/* Initiate connections to all safekeeper nodes */
|
||||
|
||||
@@ -10,9 +10,6 @@ EXTENSION = neon_test_utils
|
||||
DATA = neon_test_utils--1.0.sql
|
||||
PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging"
|
||||
|
||||
PG_CPPFLAGS = -I$(libpq_srcdir)
|
||||
SHLIB_LINK_INTERNAL = $(libpq)
|
||||
|
||||
PG_CONFIG = pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
include $(PGXS)
|
||||
|
||||
@@ -23,11 +23,6 @@ RETURNS bytea
|
||||
AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex'
|
||||
LANGUAGE C PARALLEL UNSAFE;
|
||||
|
||||
CREATE FUNCTION neon_seqscan_rel(rel regclass, nprefetch int DEFAULT 0)
|
||||
RETURNS void
|
||||
AS 'MODULE_PATHNAME', 'neon_seqscan_rel'
|
||||
LANGUAGE C PARALLEL UNSAFE;
|
||||
|
||||
CREATE FUNCTION neon_xlogflush(lsn pg_lsn)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'neon_xlogflush'
|
||||
|
||||
@@ -23,13 +23,8 @@
|
||||
#include "utils/pg_lsn.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/varlena.h"
|
||||
#include "utils/wait_event.h"
|
||||
#include "../neon/pagestore_client.h"
|
||||
|
||||
#include "libpq-fe.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "libpq/libpq.h"
|
||||
|
||||
PG_MODULE_MAGIC;
|
||||
|
||||
extern void _PG_init(void);
|
||||
@@ -39,7 +34,6 @@ PG_FUNCTION_INFO_V1(clear_buffer_cache);
|
||||
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn);
|
||||
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex);
|
||||
PG_FUNCTION_INFO_V1(neon_xlogflush);
|
||||
PG_FUNCTION_INFO_V1(neon_seqscan_rel);
|
||||
|
||||
/*
|
||||
* Linkage to functions in neon module.
|
||||
@@ -295,238 +289,6 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* A wrapper around PQgetCopyData that checks for interrupts while sleeping.
|
||||
*/
|
||||
static int
|
||||
call_PQgetCopyData(PGconn *conn, char **buffer)
|
||||
{
|
||||
int ret;
|
||||
|
||||
retry:
|
||||
ret = PQgetCopyData(conn, buffer, 1 /* async */ );
|
||||
|
||||
if (ret == 0)
|
||||
{
|
||||
int wc;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
wc = WaitLatchOrSocket(MyLatch,
|
||||
WL_LATCH_SET | WL_SOCKET_READABLE |
|
||||
WL_EXIT_ON_PM_DEATH,
|
||||
PQsocket(conn),
|
||||
-1L, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* Data available in socket? */
|
||||
if (wc & WL_SOCKET_READABLE)
|
||||
{
|
||||
if (!PQconsumeInput(conn))
|
||||
elog(ERROR, "could not get response from pageserver: %s",
|
||||
PQerrorMessage(conn));
|
||||
}
|
||||
|
||||
goto retry;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void send_getpage_request(PGconn *pageserver_conn, RelFileNode rnode, BlockNumber blkno, XLogRecPtr lsn);
|
||||
|
||||
/*
|
||||
* Fetch all pages of given relation. This simulates a sequential scan
|
||||
* over the table. You can specify the number of blocks to prefetch;
|
||||
* the function will try to keep that many requests "in flight" at all
|
||||
* times. The fetched pages are simply discarded.
|
||||
*/
|
||||
Datum
|
||||
neon_seqscan_rel(PG_FUNCTION_ARGS)
|
||||
{
|
||||
Oid relid = PG_GETARG_OID(0);
|
||||
Oid nprefetch = PG_GETARG_INT32(1);
|
||||
Relation rel;
|
||||
char *raw_page_data;
|
||||
BlockNumber nblocks;
|
||||
PGconn *pageserver_conn;
|
||||
XLogRecPtr read_lsn;
|
||||
|
||||
if (!superuser())
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
|
||||
errmsg("must be superuser to use raw page functions")));
|
||||
|
||||
rel = relation_open(relid, AccessShareLock);
|
||||
|
||||
nblocks = RelationGetNumberOfBlocks(rel);
|
||||
|
||||
pageserver_conn = PQconnectdb(page_server_connstring);
|
||||
if (PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
PQfinish(pageserver_conn);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg("could not establish connection to pageserver"),
|
||||
errdetail_internal("%s", msg)));
|
||||
}
|
||||
PG_TRY();
|
||||
{
|
||||
char *query;
|
||||
int ret;
|
||||
StringInfoData resp_buff;
|
||||
|
||||
read_lsn = GetXLogInsertRecPtr();
|
||||
|
||||
query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
|
||||
ret = PQsendQuery(pageserver_conn, query);
|
||||
if (ret != 1)
|
||||
{
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
elog(ERROR, "could not send pagestream command to pageserver");
|
||||
}
|
||||
|
||||
while (PQisBusy(pageserver_conn))
|
||||
{
|
||||
int wc;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
wc = WaitLatchOrSocket(MyLatch,
|
||||
WL_LATCH_SET | WL_SOCKET_READABLE |
|
||||
WL_EXIT_ON_PM_DEATH,
|
||||
PQsocket(pageserver_conn),
|
||||
-1L, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* Data available in socket? */
|
||||
if (wc & WL_SOCKET_READABLE)
|
||||
{
|
||||
if (!PQconsumeInput(pageserver_conn))
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
|
||||
elog(ERROR, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
elog(INFO, "scanning %u blocks, prefetch %u", nblocks, nprefetch);
|
||||
|
||||
BlockNumber nsent = 0;
|
||||
for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
|
||||
{
|
||||
NeonGetPageRequest request = {
|
||||
.req.tag = T_NeonGetPageRequest,
|
||||
.req.latest = true,
|
||||
.req.lsn = read_lsn,
|
||||
.rnode = rel->rd_node,
|
||||
.forknum = MAIN_FORKNUM,
|
||||
.blkno = blkno
|
||||
};
|
||||
NeonResponse *resp;
|
||||
|
||||
if (blkno % 1024 == 0)
|
||||
elog(INFO, "blk %u/%u", blkno, nblocks);
|
||||
|
||||
if (nsent < blkno + nprefetch + 1 && nsent < nblocks)
|
||||
{
|
||||
while (nsent < blkno + nprefetch + 1 && nsent < nblocks)
|
||||
send_getpage_request(pageserver_conn, rel->rd_node, nsent++, read_lsn);
|
||||
|
||||
if (PQflush(pageserver_conn))
|
||||
{
|
||||
char *msg = PQerrorMessage(pageserver_conn);
|
||||
|
||||
elog(ERROR, "failed to flush page requests: %s", msg);
|
||||
}
|
||||
}
|
||||
|
||||
/* read response */
|
||||
resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data);
|
||||
resp_buff.cursor = 0;
|
||||
|
||||
if (resp_buff.len < 0)
|
||||
{
|
||||
if (resp_buff.len == -1)
|
||||
elog(ERROR, "end of COPY");
|
||||
else if (resp_buff.len == -2)
|
||||
elog(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
|
||||
}
|
||||
resp = nm_unpack_response(&resp_buff);
|
||||
|
||||
switch (resp->tag)
|
||||
{
|
||||
case T_NeonGetPageResponse:
|
||||
/* ok */
|
||||
break;
|
||||
|
||||
case T_NeonErrorResponse:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg("could not read block %u", blkno),
|
||||
errdetail("page server returned error: %s",
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
break;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
|
||||
PQfreemem(resp_buff.data);
|
||||
}
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
PQfinish(pageserver_conn);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
relation_close(rel, AccessShareLock);
|
||||
}
|
||||
|
||||
static void
|
||||
send_getpage_request(PGconn *pageserver_conn, RelFileNode rnode, BlockNumber blkno, XLogRecPtr lsn)
|
||||
{
|
||||
NeonGetPageRequest request = {
|
||||
.req.tag = T_NeonGetPageRequest,
|
||||
.req.latest = true,
|
||||
.req.lsn = lsn,
|
||||
.rnode = rnode,
|
||||
.forknum = MAIN_FORKNUM,
|
||||
.blkno = blkno
|
||||
};
|
||||
StringInfoData req_buff;
|
||||
|
||||
req_buff = nm_pack_request(&request.req);
|
||||
/*
|
||||
* Send request.
|
||||
*
|
||||
* In principle, this could block if the output buffer is full, and we
|
||||
* should use async mode and check for interrupts while waiting. In
|
||||
* practice, our requests are small enough to always fit in the output and
|
||||
* TCP buffer.
|
||||
*/
|
||||
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
|
||||
{
|
||||
char *msg = PQerrorMessage(pageserver_conn);
|
||||
|
||||
elog(ERROR, "failed to send page request: %s", msg);
|
||||
}
|
||||
pfree(req_buff.data);
|
||||
}
|
||||
|
||||
/*
|
||||
* Directly calls XLogFlush(lsn) to flush WAL buffers.
|
||||
*/
|
||||
|
||||
22
pgxn/neon_walredo/Makefile
Normal file
22
pgxn/neon_walredo/Makefile
Normal file
@@ -0,0 +1,22 @@
|
||||
# pgxs/neon_walredo/Makefile
|
||||
|
||||
MODULE_big = neon_walredo
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
inmem_smgr.o \
|
||||
walredoproc.o \
|
||||
|
||||
# This really should be guarded by $(with_libseccomp), but I couldn't
|
||||
# make that work with pgxs. So we always compile it, but its contents
|
||||
# are wrapped in #ifdef HAVE_LIBSECCOMP instead.
|
||||
OBJS += seccomp.o
|
||||
|
||||
PGFILEDESC = "neon_walredo - helper process that runs in Neon pageserver"
|
||||
|
||||
PG_CONFIG = pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
include $(PGXS)
|
||||
|
||||
ifeq ($(with_libseccomp),yes)
|
||||
SHLIB_LINK += -lseccomp
|
||||
endif
|
||||
@@ -3,9 +3,8 @@
|
||||
* inmem_smgr.c
|
||||
*
|
||||
* This is an implementation of the SMGR interface, used in the WAL redo
|
||||
* process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent
|
||||
* storage, the pages that are written out are kept in a small number of
|
||||
* in-memory buffers.
|
||||
* process. It has no persistent storage, the pages that are written out
|
||||
* are kept in a small number of in-memory buffers.
|
||||
*
|
||||
* Normally, replaying a WAL record only needs to access a handful of
|
||||
* buffers, which fit in the normal buffer cache, so this is just for
|
||||
@@ -15,15 +14,11 @@
|
||||
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* contrib/neon/inmem_smgr.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/xlog.h"
|
||||
#include "pagestore_client.h"
|
||||
#include "storage/block.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/relfilenode.h"
|
||||
@@ -33,6 +28,8 @@
|
||||
#include "access/xlogutils.h"
|
||||
#endif
|
||||
|
||||
#include "inmem_smgr.h"
|
||||
|
||||
/* Size of the in-memory smgr */
|
||||
#define MAX_PAGES 64
|
||||
|
||||
@@ -59,10 +56,34 @@ locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
/* neon wal-redo storage manager functionality */
|
||||
static void inmem_init(void);
|
||||
static void inmem_open(SMgrRelation reln);
|
||||
static void inmem_close(SMgrRelation reln, ForkNumber forknum);
|
||||
static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
|
||||
static bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
|
||||
static void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
|
||||
static void inmem_extend(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, char *buffer, bool skipFsync);
|
||||
static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum);
|
||||
static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
char *buffer);
|
||||
static void inmem_write(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, char *buffer, bool skipFsync);
|
||||
static void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, BlockNumber nblocks);
|
||||
static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
|
||||
static void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber nblocks);
|
||||
static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
|
||||
|
||||
|
||||
/*
|
||||
* inmem_init() -- Initialize private state
|
||||
*/
|
||||
void
|
||||
static void
|
||||
inmem_init(void)
|
||||
{
|
||||
used_pages = 0;
|
||||
@@ -71,7 +92,7 @@ inmem_init(void)
|
||||
/*
|
||||
* inmem_exists() -- Does the physical file exist?
|
||||
*/
|
||||
bool
|
||||
static bool
|
||||
inmem_exists(SMgrRelation reln, ForkNumber forknum)
|
||||
{
|
||||
for (int i = 0; i < used_pages; i++)
|
||||
@@ -90,7 +111,7 @@ inmem_exists(SMgrRelation reln, ForkNumber forknum)
|
||||
*
|
||||
* If isRedo is true, it's okay for the relation to exist already.
|
||||
*/
|
||||
void
|
||||
static void
|
||||
inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
|
||||
{
|
||||
}
|
||||
@@ -98,7 +119,7 @@ inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
|
||||
/*
|
||||
* inmem_unlink() -- Unlink a relation.
|
||||
*/
|
||||
void
|
||||
static void
|
||||
inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
|
||||
{
|
||||
}
|
||||
@@ -112,7 +133,7 @@ inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
|
||||
* EOF). Note that we assume writing a block beyond current EOF
|
||||
* causes intervening file space to become filled with zeroes.
|
||||
*/
|
||||
void
|
||||
static void
|
||||
inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
char *buffer, bool skipFsync)
|
||||
{
|
||||
@@ -123,7 +144,7 @@ inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
/*
|
||||
* inmem_open() -- Initialize newly-opened relation.
|
||||
*/
|
||||
void
|
||||
static void
|
||||
inmem_open(SMgrRelation reln)
|
||||
{
|
||||
}
|
||||
@@ -131,7 +152,7 @@ inmem_open(SMgrRelation reln)
|
||||
/*
|
||||
* inmem_close() -- Close the specified relation, if it isn't closed already.
|
||||
*/
|
||||
void
|
||||
static void
|
||||
inmem_close(SMgrRelation reln, ForkNumber forknum)
|
||||
{
|
||||
}
|
||||
@@ -139,7 +160,7 @@ inmem_close(SMgrRelation reln, ForkNumber forknum)
|
||||
/*
|
||||
* inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation
|
||||
*/
|
||||
bool
|
||||
static bool
|
||||
inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
{
|
||||
return true;
|
||||
@@ -148,7 +169,7 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
/*
|
||||
* inmem_writeback() -- Tell the kernel to write pages back to storage.
|
||||
*/
|
||||
void
|
||||
static void
|
||||
inmem_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, BlockNumber nblocks)
|
||||
{
|
||||
@@ -157,7 +178,7 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
/*
|
||||
* inmem_read() -- Read the specified block from a relation.
|
||||
*/
|
||||
void
|
||||
static void
|
||||
inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
char *buffer)
|
||||
{
|
||||
@@ -177,7 +198,7 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
* relation (ie, those before the current EOF). To extend a relation,
|
||||
* use mdextend().
|
||||
*/
|
||||
void
|
||||
static void
|
||||
inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
char *buffer, bool skipFsync)
|
||||
{
|
||||
@@ -224,7 +245,7 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
/*
|
||||
* inmem_nblocks() -- Get the number of blocks stored in a relation.
|
||||
*/
|
||||
BlockNumber
|
||||
static BlockNumber
|
||||
inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
{
|
||||
/*
|
||||
@@ -243,7 +264,7 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
/*
|
||||
* inmem_truncate() -- Truncate relation to specified number of blocks.
|
||||
*/
|
||||
void
|
||||
static void
|
||||
inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
{
|
||||
}
|
||||
@@ -251,7 +272,7 @@ inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
/*
|
||||
* inmem_immedsync() -- Immediately sync a relation to stable storage.
|
||||
*/
|
||||
void
|
||||
static void
|
||||
inmem_immedsync(SMgrRelation reln, ForkNumber forknum)
|
||||
{
|
||||
}
|
||||
17
pgxn/neon_walredo/inmem_smgr.h
Normal file
17
pgxn/neon_walredo/inmem_smgr.h
Normal file
@@ -0,0 +1,17 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* inmem_smgr.h
|
||||
*
|
||||
*
|
||||
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#ifndef INMEM_SMGR_H
|
||||
#define INMEM_SMGR_H
|
||||
|
||||
extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
|
||||
extern void smgr_init_inmem(void);
|
||||
|
||||
#endif /* INMEM_SMGR_H */
|
||||
22
pgxn/neon_walredo/neon_seccomp.h
Normal file
22
pgxn/neon_walredo/neon_seccomp.h
Normal file
@@ -0,0 +1,22 @@
|
||||
#ifndef NEON_SECCOMP_H
|
||||
#define NEON_SECCOMP_H
|
||||
|
||||
#include <seccomp.h>
|
||||
|
||||
typedef struct {
|
||||
int psr_syscall; /* syscall number */
|
||||
uint32 psr_action; /* libseccomp action, e.g. SCMP_ACT_ALLOW */
|
||||
} PgSeccompRule;
|
||||
|
||||
#define PG_SCMP(syscall, action) \
|
||||
(PgSeccompRule) { \
|
||||
.psr_syscall = SCMP_SYS(syscall), \
|
||||
.psr_action = (action), \
|
||||
}
|
||||
|
||||
#define PG_SCMP_ALLOW(syscall) \
|
||||
PG_SCMP(syscall, SCMP_ACT_ALLOW)
|
||||
|
||||
extern void seccomp_load_rules(PgSeccompRule *syscalls, int count);
|
||||
|
||||
#endif /* NEON_SECCOMP_H */
|
||||
257
pgxn/neon_walredo/seccomp.c
Normal file
257
pgxn/neon_walredo/seccomp.c
Normal file
@@ -0,0 +1,257 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* seccomp.c
|
||||
* Secure Computing BPF API wrapper.
|
||||
*
|
||||
* Pageserver delegates complex WAL decoding duties to postgres,
|
||||
* which means that the latter might fall victim to carefully designed
|
||||
* malicious WAL records and start doing harmful things to the system.
|
||||
* To prevent this, it has been decided to limit possible interactions
|
||||
* with the outside world using the Secure Computing BPF mode.
|
||||
*
|
||||
* We use this mode to disable all syscalls not in the allowlist. This
|
||||
* approach has its pros & cons:
|
||||
*
|
||||
* - We have to carefully handpick and maintain the set of syscalls
|
||||
* required for the WAL redo process. Core dumps help with that.
|
||||
* The method of trial and error seems to work reasonably well,
|
||||
* but it would be nice to find a proper way to "prove" that
|
||||
* the set in question is both necessary and sufficient.
|
||||
*
|
||||
* - Once we enter the seccomp bpf mode, it's impossible to lift those
|
||||
* restrictions (otherwise, what kind of "protection" would that be?).
|
||||
* Thus, we have to either enable extra syscalls for the clean shutdown,
|
||||
* or exit the process immediately via _exit() instead of proc_exit().
|
||||
*
|
||||
* - Should we simply use SCMP_ACT_KILL_PROCESS, or implement a custom
|
||||
* facility to deal with the forbidden syscalls? If we'd like to embed
|
||||
* a startup security test, we should go with the latter; In that
|
||||
* case, which one of the following options is preferable?
|
||||
*
|
||||
* * Catch the denied syscalls with a signal handler using SCMP_ACT_TRAP.
|
||||
* Provide a common signal handler with a static switch to override
|
||||
* its behavior for the test case. This would undermine the whole
|
||||
* purpose of such protection, so we'd have to go further and remap
|
||||
* the memory backing the switch as readonly, then ban mprotect().
|
||||
* Ugly and fragile, to say the least.
|
||||
*
|
||||
* * Yet again, catch the denied syscalls using SCMP_ACT_TRAP.
|
||||
* Provide 2 different signal handlers: one for a test case,
|
||||
* another for the main processing loop. Install the first one,
|
||||
* enable seccomp, perform the test, switch to the second one,
|
||||
* finally ban sigaction(), presto!
|
||||
*
|
||||
* * Spoof the result of a syscall using SECCOMP_RET_ERRNO for the
|
||||
* test, then ban it altogether with another filter. The downside
|
||||
* of this solution is that we don't actually check that
|
||||
* SCMP_ACT_KILL_PROCESS/SCMP_ACT_TRAP works.
|
||||
*
|
||||
* Either approach seems to require two eBPF filter programs,
|
||||
* which is unfortunate: the man page tells this is uncommon.
|
||||
* Maybe I (@funbringer) am missing something, though; I encourage
|
||||
* any reader to get familiar with it and scrutinize my conclusions.
|
||||
*
|
||||
* TODOs and ideas in no particular order:
|
||||
*
|
||||
* - Do something about mmap() in musl's malloc().
|
||||
* Definitely not a priority if we don't care about musl.
|
||||
*
|
||||
* - See if we can untangle PG's shutdown sequence (involving unlink()):
|
||||
*
|
||||
* * Simplify (or rather get rid of) shmem setup in PG's WAL redo mode.
|
||||
* * Investigate chroot() or mount namespaces for better FS isolation.
|
||||
* * (Per Heikki) Simply call _exit(), no big deal.
|
||||
* * Come up with a better idea?
|
||||
*
|
||||
* - Make use of seccomp's argument inspection (for what?).
|
||||
* Unfortunately, it views all syscall arguments as scalars,
|
||||
* so it won't work for e.g. string comparison in unlink().
|
||||
*
|
||||
* - Benchmark with bpf jit on/off, try seccomp_syscall_priority().
|
||||
*
|
||||
* - Test against various linux distros & glibc versions.
|
||||
* I suspect that certain libc functions might involve slightly
|
||||
* different syscalls, e.g. select/pselect6/pselect6_time64/whatever.
|
||||
*
|
||||
* - Test on any arch other than amd64 to see if it works there.
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
/*
|
||||
* I couldn't find a good way to do a conditional OBJS += seccomp.o in
|
||||
* the Makefile, so this file is compiled even when seccomp is disabled,
|
||||
* it's just empty in that case.
|
||||
*/
|
||||
#ifdef HAVE_LIBSECCOMP
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "miscadmin.h"
|
||||
|
||||
#include "neon_seccomp.h"
|
||||
|
||||
static void die(int code, const char *str);
|
||||
|
||||
static bool seccomp_test_sighandler_done = false;
|
||||
static void seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt);
|
||||
static void seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt);
|
||||
|
||||
static int do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action);
|
||||
|
||||
void
|
||||
seccomp_load_rules(PgSeccompRule *rules, int count)
|
||||
{
|
||||
struct sigaction action = { .sa_flags = SA_SIGINFO };
|
||||
PgSeccompRule rule;
|
||||
long fd;
|
||||
|
||||
/*
|
||||
* Install a test signal handler.
|
||||
* XXX: pqsignal() is too restrictive for our purposes,
|
||||
* since we'd like to examine the contents of siginfo_t.
|
||||
*/
|
||||
action.sa_sigaction = seccomp_test_sighandler;
|
||||
if (sigaction(SIGSYS, &action, NULL) != 0)
|
||||
ereport(FATAL,
|
||||
(errcode(ERRCODE_SYSTEM_ERROR),
|
||||
errmsg("seccomp: could not install test SIGSYS handler")));
|
||||
|
||||
/*
|
||||
* First, check that open of a well-known file works.
|
||||
* XXX: We use raw syscall() to call the very open().
|
||||
*/
|
||||
fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0);
|
||||
if (seccomp_test_sighandler_done)
|
||||
ereport(FATAL,
|
||||
(errcode(ERRCODE_SYSTEM_ERROR),
|
||||
errmsg("seccomp: signal handler test flag was set unexpectedly")));
|
||||
if (fd < 0)
|
||||
ereport(FATAL,
|
||||
(errcode(ERRCODE_SYSTEM_ERROR),
|
||||
errmsg("seccomp: could not open /dev/null for seccomp testing: %m")));
|
||||
close((int) fd);
|
||||
|
||||
/* Set a trap on open() to test seccomp bpf */
|
||||
rule = PG_SCMP(open, SCMP_ACT_TRAP);
|
||||
if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0)
|
||||
ereport(FATAL,
|
||||
(errcode(ERRCODE_SYSTEM_ERROR),
|
||||
errmsg("seccomp: could not load test trap")));
|
||||
|
||||
/* Finally, check that open() now raises SIGSYS */
|
||||
(void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0);
|
||||
if (!seccomp_test_sighandler_done)
|
||||
ereport(FATAL,
|
||||
(errcode(ERRCODE_SYSTEM_ERROR),
|
||||
errmsg("seccomp: SIGSYS handler doesn't seem to work")));
|
||||
|
||||
/* Now that everything seems to work, install a proper handler */
|
||||
action.sa_sigaction = seccomp_deny_sighandler;
|
||||
if (sigaction(SIGSYS, &action, NULL) != 0)
|
||||
ereport(FATAL,
|
||||
(errcode(ERRCODE_SYSTEM_ERROR),
|
||||
errmsg("seccomp: could not install SIGSYS handler")));
|
||||
|
||||
/* If this succeeds, any syscall not in the list will crash the process */
|
||||
if (do_seccomp_load_rules(rules, count, SCMP_ACT_TRAP) != 0)
|
||||
ereport(FATAL,
|
||||
(errcode(ERRCODE_SYSTEM_ERROR),
|
||||
errmsg("seccomp: could not enter seccomp mode")));
|
||||
}
|
||||
|
||||
/*
|
||||
* Enter seccomp mode with a BPF filter that will only allow
|
||||
* certain syscalls to proceed.
|
||||
*/
|
||||
static int
|
||||
do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action)
|
||||
{
|
||||
scmp_filter_ctx ctx;
|
||||
int rc = -1;
|
||||
|
||||
/* Create a context with a default action for syscalls not in the list */
|
||||
if ((ctx = seccomp_init(def_action)) == NULL)
|
||||
goto cleanup;
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
PgSeccompRule *rule = &rules[i];
|
||||
if ((rc = seccomp_rule_add(ctx, rule->psr_action, rule->psr_syscall, 0)) != 0)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Try building & loading the program into the kernel */
|
||||
if ((rc = seccomp_load(ctx)) != 0)
|
||||
goto cleanup;
|
||||
|
||||
cleanup:
|
||||
/*
|
||||
* We don't need the context anymore regardless of the result,
|
||||
* since either we failed or the eBPF program has already been
|
||||
* loaded into the linux kernel.
|
||||
*/
|
||||
seccomp_release(ctx);
|
||||
return rc;
|
||||
}
|
||||
|
||||
static void
|
||||
die(int code, const char *str)
|
||||
{
|
||||
/* work around gcc ignoring that it shouldn't warn on (void) result being unused */
|
||||
ssize_t _unused pg_attribute_unused();
|
||||
/* Best effort write to stderr */
|
||||
_unused = write(fileno(stderr), str, strlen(str));
|
||||
|
||||
/* XXX: we don't want to run any atexit callbacks */
|
||||
_exit(code);
|
||||
}
|
||||
|
||||
static void
|
||||
seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused())
|
||||
{
|
||||
#define DIE_PREFIX "seccomp test signal handler: "
|
||||
|
||||
/* Check that this signal handler is used only for a single test case */
|
||||
if (seccomp_test_sighandler_done)
|
||||
die(1, DIE_PREFIX "test handler should only be used for 1 test\n");
|
||||
seccomp_test_sighandler_done = true;
|
||||
|
||||
if (signum != SIGSYS)
|
||||
die(1, DIE_PREFIX "bad signal number\n");
|
||||
|
||||
/* TODO: maybe somehow extract the hardcoded syscall number */
|
||||
if (info->si_syscall != SCMP_SYS(open))
|
||||
die(1, DIE_PREFIX "bad syscall number\n");
|
||||
|
||||
#undef DIE_PREFIX
|
||||
}
|
||||
|
||||
static void
|
||||
seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused())
|
||||
{
|
||||
/*
|
||||
* Unfortunately, we can't use seccomp_syscall_resolve_num_arch()
|
||||
* to resolve the syscall's name, since it calls strdup()
|
||||
* under the hood (wtf!).
|
||||
*/
|
||||
char buffer[128];
|
||||
(void)snprintf(buffer, lengthof(buffer),
|
||||
"---------------------------------------\n"
|
||||
"seccomp: bad syscall %d\n"
|
||||
"---------------------------------------\n",
|
||||
info->si_syscall);
|
||||
|
||||
/*
|
||||
* Instead of silently crashing the process with
|
||||
* a fake SIGSYS caused by SCMP_ACT_KILL_PROCESS,
|
||||
* we'd like to receive a real SIGSYS to print the
|
||||
* message and *then* immediately exit.
|
||||
*/
|
||||
die(1, buffer);
|
||||
}
|
||||
|
||||
#endif /* HAVE_LIBSECCOMP */
|
||||
847
pgxn/neon_walredo/walredoproc.c
Normal file
847
pgxn/neon_walredo/walredoproc.c
Normal file
@@ -0,0 +1,847 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* walredoproc.c
|
||||
* Entry point for WAL redo helper
|
||||
*
|
||||
*
|
||||
* This file contains an alternative main() function for the 'postgres'
|
||||
* binary. In this mode, we go into a special mode that's similar
|
||||
* to the single user mode. We don't launch postmaster or any auxiliary
|
||||
* processes. Instead, we wait for command from 'stdin', and respond to
|
||||
* 'stdout'.
|
||||
*
|
||||
* The protocol through stdin/stdout is loosely based on the libpq protocol.
|
||||
* The process accepts messages through stdin, and each message has the format:
|
||||
*
|
||||
* char msgtype;
|
||||
* int32 length; // length of message including 'length' but excluding
|
||||
* // 'msgtype', in network byte order
|
||||
* <payload>
|
||||
*
|
||||
* There are four message types:
|
||||
*
|
||||
* BeginRedoForBlock ('B'): Prepare for WAL replay for given block
|
||||
* PushPage ('P'): Copy a page image (in the payload) to buffer cache
|
||||
* ApplyRecord ('A'): Apply a WAL record (in the payload)
|
||||
* GetPage ('G'): Return a page image from buffer cache.
|
||||
*
|
||||
* Currently, you only get a response to GetPage requests; the response is
|
||||
* simply a 8k page, without any headers. Errors are logged to stderr.
|
||||
*
|
||||
* FIXME:
|
||||
* - this currently requires a valid PGDATA, and creates a lock file there
|
||||
* like a normal postmaster. There's no fundamental reason for that, though.
|
||||
* - should have EndRedoForBlock, and flush page cache, to allow using this
|
||||
* mechanism for more than one block without restarting the process.
|
||||
*
|
||||
*
|
||||
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <limits.h>
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/socket.h>
|
||||
#ifdef HAVE_SYS_SELECT_H
|
||||
#include <sys/select.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_RESOURCE_H
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_LIBSECCOMP) && defined(__GLIBC__)
|
||||
#define MALLOC_NO_MMAP
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_GETRUSAGE
|
||||
#include "rusagestub.h"
|
||||
#endif
|
||||
|
||||
#include "access/xlog.h"
|
||||
#include "access/xlog_internal.h"
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
#include "access/xlogrecovery.h"
|
||||
#endif
|
||||
#include "access/xlogutils.h"
|
||||
#include "catalog/pg_class.h"
|
||||
#include "libpq/libpq.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "miscadmin.h"
|
||||
#include "postmaster/postmaster.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/ipc.h"
|
||||
#include "storage/proc.h"
|
||||
#include "storage/smgr.h"
|
||||
#include "tcop/tcopprot.h"
|
||||
#include "utils/memutils.h"
|
||||
#include "utils/ps_status.h"
|
||||
|
||||
#include "inmem_smgr.h"
|
||||
|
||||
#ifdef HAVE_LIBSECCOMP
|
||||
#include "neon_seccomp.h"
|
||||
#endif
|
||||
|
||||
PG_MODULE_MAGIC;
|
||||
|
||||
static int ReadRedoCommand(StringInfo inBuf);
|
||||
static void BeginRedoForBlock(StringInfo input_message);
|
||||
static void PushPage(StringInfo input_message);
|
||||
static void ApplyRecord(StringInfo input_message);
|
||||
static void apply_error_callback(void *arg);
|
||||
static bool redo_block_filter(XLogReaderState *record, uint8 block_id);
|
||||
static void GetPage(StringInfo input_message);
|
||||
static ssize_t buffered_read(void *buf, size_t count);
|
||||
|
||||
static BufferTag target_redo_tag;
|
||||
|
||||
static XLogReaderState *reader_state;
|
||||
|
||||
#define TRACE DEBUG5
|
||||
|
||||
#ifdef HAVE_LIBSECCOMP
|
||||
static void
|
||||
enter_seccomp_mode(void)
|
||||
{
|
||||
PgSeccompRule syscalls[] =
|
||||
{
|
||||
/* Hard requirements */
|
||||
PG_SCMP_ALLOW(exit_group),
|
||||
PG_SCMP_ALLOW(pselect6),
|
||||
PG_SCMP_ALLOW(read),
|
||||
PG_SCMP_ALLOW(select),
|
||||
PG_SCMP_ALLOW(write),
|
||||
|
||||
/* Memory allocation */
|
||||
PG_SCMP_ALLOW(brk),
|
||||
#ifndef MALLOC_NO_MMAP
|
||||
/* TODO: musl doesn't have mallopt */
|
||||
PG_SCMP_ALLOW(mmap),
|
||||
PG_SCMP_ALLOW(munmap),
|
||||
#endif
|
||||
/*
|
||||
* getpid() is called on assertion failure, in ExceptionalCondition.
|
||||
* It's not really needed, but seems pointless to hide it either. The
|
||||
* system call unlikely to expose a kernel vulnerability, and the PID
|
||||
* is stored in MyProcPid anyway.
|
||||
*/
|
||||
PG_SCMP_ALLOW(getpid),
|
||||
|
||||
/* Enable those for a proper shutdown.
|
||||
PG_SCMP_ALLOW(munmap),
|
||||
PG_SCMP_ALLOW(shmctl),
|
||||
PG_SCMP_ALLOW(shmdt),
|
||||
PG_SCMP_ALLOW(unlink), // shm_unlink
|
||||
*/
|
||||
};
|
||||
|
||||
#ifdef MALLOC_NO_MMAP
|
||||
/* Ask glibc not to use mmap() */
|
||||
mallopt(M_MMAP_MAX, 0);
|
||||
#endif
|
||||
|
||||
seccomp_load_rules(syscalls, lengthof(syscalls));
|
||||
}
|
||||
#endif /* HAVE_LIBSECCOMP */
|
||||
|
||||
/*
|
||||
* Entry point for the WAL redo process.
|
||||
*
|
||||
* Performs similar initialization as PostgresMain does for normal
|
||||
* backend processes. Some initialization was done in CallExtMain
|
||||
* already.
|
||||
*/
|
||||
void
|
||||
WalRedoMain(int argc, char *argv[])
|
||||
{
|
||||
int firstchar;
|
||||
StringInfoData input_message;
|
||||
#ifdef HAVE_LIBSECCOMP
|
||||
bool enable_seccomp;
|
||||
#endif
|
||||
|
||||
am_wal_redo_postgres = true;
|
||||
|
||||
/*
|
||||
* WAL redo does not need a large number of buffers. And speed of
|
||||
* DropRelFileNodeAllLocalBuffers() is proportional to the number of
|
||||
* buffers. So let's keep it small (default value is 1024)
|
||||
*/
|
||||
num_temp_buffers = 4;
|
||||
|
||||
/*
|
||||
* install the simple in-memory smgr
|
||||
*/
|
||||
smgr_hook = smgr_inmem;
|
||||
smgr_init_hook = smgr_init_inmem;
|
||||
|
||||
/*
|
||||
* Validate we have been given a reasonable-looking DataDir and change into it.
|
||||
*/
|
||||
checkDataDir();
|
||||
ChangeToDataDir();
|
||||
|
||||
/*
|
||||
* Create lockfile for data directory.
|
||||
*/
|
||||
CreateDataDirLockFile(false);
|
||||
|
||||
/* read control file (error checking and contains config ) */
|
||||
LocalProcessControlFile(false);
|
||||
|
||||
/*
|
||||
* process any libraries that should be preloaded at postmaster start
|
||||
*/
|
||||
process_shared_preload_libraries();
|
||||
|
||||
/* Initialize MaxBackends (if under postmaster, was done already) */
|
||||
InitializeMaxBackends();
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
/*
|
||||
* Give preloaded libraries a chance to request additional shared memory.
|
||||
*/
|
||||
process_shmem_requests();
|
||||
|
||||
/*
|
||||
* Now that loadable modules have had their chance to request additional
|
||||
* shared memory, determine the value of any runtime-computed GUCs that
|
||||
* depend on the amount of shared memory required.
|
||||
*/
|
||||
InitializeShmemGUCs();
|
||||
|
||||
/*
|
||||
* Now that modules have been loaded, we can process any custom resource
|
||||
* managers specified in the wal_consistency_checking GUC.
|
||||
*/
|
||||
InitializeWalConsistencyChecking();
|
||||
#endif
|
||||
|
||||
CreateSharedMemoryAndSemaphores();
|
||||
|
||||
/*
|
||||
* Remember stand-alone backend startup time,roughly at the same point
|
||||
* during startup that postmaster does so.
|
||||
*/
|
||||
PgStartTime = GetCurrentTimestamp();
|
||||
|
||||
/*
|
||||
* Create a per-backend PGPROC struct in shared memory. We must do
|
||||
* this before we can use LWLocks.
|
||||
*/
|
||||
InitAuxiliaryProcess();
|
||||
|
||||
SetProcessingMode(NormalProcessing);
|
||||
|
||||
/* Redo routines won't work if we're not "in recovery" */
|
||||
InRecovery = true;
|
||||
|
||||
/*
|
||||
* Create the memory context we will use in the main loop.
|
||||
*
|
||||
* MessageContext is reset once per iteration of the main loop, ie, upon
|
||||
* completion of processing of each command message from the client.
|
||||
*/
|
||||
MessageContext = AllocSetContextCreate(TopMemoryContext,
|
||||
"MessageContext",
|
||||
ALLOCSET_DEFAULT_SIZES);
|
||||
|
||||
/* we need a ResourceOwner to hold buffer pins */
|
||||
Assert(CurrentResourceOwner == NULL);
|
||||
CurrentResourceOwner = ResourceOwnerCreate(NULL, "wal redo");
|
||||
|
||||
/* Initialize resource managers */
|
||||
for (int rmid = 0; rmid <= RM_MAX_ID; rmid++)
|
||||
{
|
||||
if (RmgrTable[rmid].rm_startup != NULL)
|
||||
RmgrTable[rmid].rm_startup();
|
||||
}
|
||||
reader_state = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(), NULL);
|
||||
|
||||
#ifdef HAVE_LIBSECCOMP
|
||||
/* We prefer opt-out to opt-in for greater security */
|
||||
enable_seccomp = true;
|
||||
for (int i = 1; i < argc; i++)
|
||||
if (strcmp(argv[i], "--disable-seccomp") == 0)
|
||||
enable_seccomp = false;
|
||||
|
||||
/*
|
||||
* We deliberately delay the transition to the seccomp mode
|
||||
* until it's time to enter the main processing loop;
|
||||
* else we'd have to add a lot more syscalls to the allowlist.
|
||||
*/
|
||||
if (enable_seccomp)
|
||||
enter_seccomp_mode();
|
||||
#endif /* HAVE_LIBSECCOMP */
|
||||
|
||||
/*
|
||||
* Main processing loop
|
||||
*/
|
||||
MemoryContextSwitchTo(MessageContext);
|
||||
initStringInfo(&input_message);
|
||||
|
||||
for (;;)
|
||||
{
|
||||
/* Release memory left over from prior query cycle. */
|
||||
resetStringInfo(&input_message);
|
||||
|
||||
set_ps_display("idle");
|
||||
|
||||
/*
|
||||
* (3) read a command (loop blocks here)
|
||||
*/
|
||||
firstchar = ReadRedoCommand(&input_message);
|
||||
switch (firstchar)
|
||||
{
|
||||
case 'B': /* BeginRedoForBlock */
|
||||
BeginRedoForBlock(&input_message);
|
||||
break;
|
||||
|
||||
case 'P': /* PushPage */
|
||||
PushPage(&input_message);
|
||||
break;
|
||||
|
||||
case 'A': /* ApplyRecord */
|
||||
ApplyRecord(&input_message);
|
||||
break;
|
||||
|
||||
case 'G': /* GetPage */
|
||||
GetPage(&input_message);
|
||||
break;
|
||||
|
||||
/*
|
||||
* EOF means we're done. Perform normal shutdown.
|
||||
*/
|
||||
case EOF:
|
||||
ereport(LOG,
|
||||
(errmsg("received EOF on stdin, shutting down")));
|
||||
|
||||
#ifdef HAVE_LIBSECCOMP
|
||||
/*
|
||||
* Skip the shutdown sequence, leaving some garbage behind.
|
||||
* Hopefully, postgres will clean it up in the next run.
|
||||
* This way we don't have to enable extra syscalls, which is nice.
|
||||
* See enter_seccomp_mode() above.
|
||||
*/
|
||||
if (enable_seccomp)
|
||||
_exit(0);
|
||||
#endif /* HAVE_LIBSECCOMP */
|
||||
/*
|
||||
* NOTE: if you are tempted to add more code here, DON'T!
|
||||
* Whatever you had in mind to do should be set up as an
|
||||
* on_proc_exit or on_shmem_exit callback, instead. Otherwise
|
||||
* it will fail to be called during other backend-shutdown
|
||||
* scenarios.
|
||||
*/
|
||||
proc_exit(0);
|
||||
|
||||
default:
|
||||
ereport(FATAL,
|
||||
(errcode(ERRCODE_PROTOCOL_VIOLATION),
|
||||
errmsg("invalid frontend message type %d",
|
||||
firstchar)));
|
||||
}
|
||||
} /* end of input-reading loop */
|
||||
}
|
||||
|
||||
|
||||
/* Version compatility wrapper for ReadBufferWithoutRelcache */
|
||||
static inline Buffer
|
||||
NeonRedoReadBuffer(RelFileNode rnode,
|
||||
ForkNumber forkNum, BlockNumber blockNum,
|
||||
ReadBufferMode mode)
|
||||
{
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
return ReadBufferWithoutRelcache(rnode, forkNum, blockNum, mode,
|
||||
NULL, /* no strategy */
|
||||
true); /* WAL redo is only performed on permanent rels */
|
||||
#else
|
||||
return ReadBufferWithoutRelcache(rnode, forkNum, blockNum, mode,
|
||||
NULL); /* no strategy */
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Some debug function that may be handy for now.
|
||||
*/
|
||||
pg_attribute_unused()
|
||||
static char *
|
||||
pprint_buffer(char *data, int len)
|
||||
{
|
||||
StringInfoData s;
|
||||
|
||||
initStringInfo(&s);
|
||||
appendStringInfo(&s, "\n");
|
||||
for (int i = 0; i < len; i++) {
|
||||
|
||||
appendStringInfo(&s, "%02x ", (*(((char *) data) + i) & 0xff) );
|
||||
if (i % 32 == 31) {
|
||||
appendStringInfo(&s, "\n");
|
||||
}
|
||||
}
|
||||
appendStringInfo(&s, "\n");
|
||||
|
||||
return s.data;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* routines to obtain user input
|
||||
* ----------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/*
|
||||
* Read next command from the client.
|
||||
*
|
||||
* the string entered by the user is placed in its parameter inBuf,
|
||||
* and we act like a Q message was received.
|
||||
*
|
||||
* EOF is returned if end-of-file input is seen; time to shut down.
|
||||
* ----------------
|
||||
*/
|
||||
static int
|
||||
ReadRedoCommand(StringInfo inBuf)
|
||||
{
|
||||
ssize_t ret;
|
||||
char hdr[1 + sizeof(int32)];
|
||||
int qtype;
|
||||
int32 len;
|
||||
|
||||
/* Read message type and message length */
|
||||
ret = buffered_read(hdr, sizeof(hdr));
|
||||
if (ret != sizeof(hdr))
|
||||
{
|
||||
if (ret == 0)
|
||||
return EOF;
|
||||
else if (ret < 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONNECTION_FAILURE),
|
||||
errmsg("could not read message header: %m")));
|
||||
else
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_PROTOCOL_VIOLATION),
|
||||
errmsg("unexpected EOF")));
|
||||
}
|
||||
|
||||
qtype = hdr[0];
|
||||
memcpy(&len, &hdr[1], sizeof(int32));
|
||||
len = pg_ntoh32(len);
|
||||
|
||||
if (len < 4)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_PROTOCOL_VIOLATION),
|
||||
errmsg("invalid message length")));
|
||||
|
||||
len -= 4; /* discount length itself */
|
||||
|
||||
/* Read the message payload */
|
||||
enlargeStringInfo(inBuf, len);
|
||||
ret = buffered_read(inBuf->data, len);
|
||||
if (ret != len)
|
||||
{
|
||||
if (ret < 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONNECTION_FAILURE),
|
||||
errmsg("could not read message: %m")));
|
||||
else
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_PROTOCOL_VIOLATION),
|
||||
errmsg("unexpected EOF")));
|
||||
}
|
||||
inBuf->len = len;
|
||||
inBuf->data[len] = '\0';
|
||||
|
||||
return qtype;
|
||||
}
|
||||
|
||||
/*
|
||||
* Prepare for WAL replay on given block
|
||||
*/
|
||||
static void
|
||||
BeginRedoForBlock(StringInfo input_message)
|
||||
{
|
||||
RelFileNode rnode;
|
||||
ForkNumber forknum;
|
||||
BlockNumber blknum;
|
||||
SMgrRelation reln;
|
||||
|
||||
/*
|
||||
* message format:
|
||||
*
|
||||
* spcNode
|
||||
* dbNode
|
||||
* relNode
|
||||
* ForkNumber
|
||||
* BlockNumber
|
||||
*/
|
||||
forknum = pq_getmsgbyte(input_message);
|
||||
rnode.spcNode = pq_getmsgint(input_message, 4);
|
||||
rnode.dbNode = pq_getmsgint(input_message, 4);
|
||||
rnode.relNode = pq_getmsgint(input_message, 4);
|
||||
blknum = pq_getmsgint(input_message, 4);
|
||||
wal_redo_buffer = InvalidBuffer;
|
||||
|
||||
INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum);
|
||||
|
||||
elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u",
|
||||
target_redo_tag.rnode.spcNode,
|
||||
target_redo_tag.rnode.dbNode,
|
||||
target_redo_tag.rnode.relNode,
|
||||
target_redo_tag.forkNum,
|
||||
target_redo_tag.blockNum);
|
||||
|
||||
reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT);
|
||||
if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber ||
|
||||
reln->smgr_cached_nblocks[forknum] < blknum + 1)
|
||||
{
|
||||
reln->smgr_cached_nblocks[forknum] = blknum + 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Receive a page given by the client, and put it into buffer cache.
|
||||
*/
|
||||
static void
|
||||
PushPage(StringInfo input_message)
|
||||
{
|
||||
RelFileNode rnode;
|
||||
ForkNumber forknum;
|
||||
BlockNumber blknum;
|
||||
const char *content;
|
||||
Buffer buf;
|
||||
Page page;
|
||||
|
||||
/*
|
||||
* message format:
|
||||
*
|
||||
* spcNode
|
||||
* dbNode
|
||||
* relNode
|
||||
* ForkNumber
|
||||
* BlockNumber
|
||||
* 8k page content
|
||||
*/
|
||||
forknum = pq_getmsgbyte(input_message);
|
||||
rnode.spcNode = pq_getmsgint(input_message, 4);
|
||||
rnode.dbNode = pq_getmsgint(input_message, 4);
|
||||
rnode.relNode = pq_getmsgint(input_message, 4);
|
||||
blknum = pq_getmsgint(input_message, 4);
|
||||
content = pq_getmsgbytes(input_message, BLCKSZ);
|
||||
|
||||
buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_ZERO_AND_LOCK);
|
||||
wal_redo_buffer = buf;
|
||||
page = BufferGetPage(buf);
|
||||
memcpy(page, content, BLCKSZ);
|
||||
MarkBufferDirty(buf); /* pro forma */
|
||||
UnlockReleaseBuffer(buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Receive a WAL record, and apply it.
|
||||
*
|
||||
* All the pages should be loaded into the buffer cache by PushPage calls already.
|
||||
*/
|
||||
static void
|
||||
ApplyRecord(StringInfo input_message)
|
||||
{
|
||||
char *errormsg;
|
||||
XLogRecPtr lsn;
|
||||
XLogRecord *record;
|
||||
int nleft;
|
||||
ErrorContextCallback errcallback;
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
DecodedXLogRecord *decoded;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* message format:
|
||||
*
|
||||
* LSN (the *end* of the record)
|
||||
* record
|
||||
*/
|
||||
lsn = pq_getmsgint64(input_message);
|
||||
|
||||
smgrinit(); /* reset inmem smgr state */
|
||||
|
||||
/* note: the input must be aligned here */
|
||||
record = (XLogRecord *) pq_getmsgbytes(input_message, sizeof(XLogRecord));
|
||||
|
||||
nleft = input_message->len - input_message->cursor;
|
||||
if (record->xl_tot_len != sizeof(XLogRecord) + nleft)
|
||||
elog(ERROR, "mismatch between record (%d) and message size (%d)",
|
||||
record->xl_tot_len, (int) sizeof(XLogRecord) + nleft);
|
||||
|
||||
/* Setup error traceback support for ereport() */
|
||||
errcallback.callback = apply_error_callback;
|
||||
errcallback.arg = (void *) reader_state;
|
||||
errcallback.previous = error_context_stack;
|
||||
error_context_stack = &errcallback;
|
||||
|
||||
XLogBeginRead(reader_state, lsn);
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
decoded = (DecodedXLogRecord *) XLogReadRecordAlloc(reader_state, record->xl_tot_len, true);
|
||||
|
||||
if (!DecodeXLogRecord(reader_state, decoded, record, lsn, &errormsg))
|
||||
elog(ERROR, "failed to decode WAL record: %s", errormsg);
|
||||
else
|
||||
{
|
||||
/* Record the location of the next record. */
|
||||
decoded->next_lsn = reader_state->NextRecPtr;
|
||||
|
||||
/*
|
||||
* If it's in the decode buffer, mark the decode buffer space as
|
||||
* occupied.
|
||||
*/
|
||||
if (!decoded->oversized)
|
||||
{
|
||||
/* The new decode buffer head must be MAXALIGNed. */
|
||||
Assert(decoded->size == MAXALIGN(decoded->size));
|
||||
if ((char *) decoded == reader_state->decode_buffer)
|
||||
reader_state->decode_buffer_tail = reader_state->decode_buffer + decoded->size;
|
||||
else
|
||||
reader_state->decode_buffer_tail += decoded->size;
|
||||
}
|
||||
|
||||
/* Insert it into the queue of decoded records. */
|
||||
Assert(reader_state->decode_queue_tail != decoded);
|
||||
if (reader_state->decode_queue_tail)
|
||||
reader_state->decode_queue_tail->next = decoded;
|
||||
reader_state->decode_queue_tail = decoded;
|
||||
if (!reader_state->decode_queue_head)
|
||||
reader_state->decode_queue_head = decoded;
|
||||
|
||||
/*
|
||||
* Update the pointers to the beginning and one-past-the-end of this
|
||||
* record, again for the benefit of historical code that expected the
|
||||
* decoder to track this rather than accessing these fields of the record
|
||||
* itself.
|
||||
*/
|
||||
reader_state->record = reader_state->decode_queue_head;
|
||||
reader_state->ReadRecPtr = reader_state->record->lsn;
|
||||
reader_state->EndRecPtr = reader_state->record->next_lsn;
|
||||
}
|
||||
#else
|
||||
/*
|
||||
* In lieu of calling XLogReadRecord, store the record 'decoded_record'
|
||||
* buffer directly.
|
||||
*/
|
||||
reader_state->ReadRecPtr = lsn;
|
||||
reader_state->decoded_record = record;
|
||||
if (!DecodeXLogRecord(reader_state, record, &errormsg))
|
||||
elog(ERROR, "failed to decode WAL record: %s", errormsg);
|
||||
#endif
|
||||
|
||||
/* Ignore any other blocks than the ones the caller is interested in */
|
||||
redo_read_buffer_filter = redo_block_filter;
|
||||
|
||||
RmgrTable[record->xl_rmid].rm_redo(reader_state);
|
||||
|
||||
/*
|
||||
* If no base image of the page was provided by PushPage, initialize
|
||||
* wal_redo_buffer here. The first WAL record must initialize the page
|
||||
* in that case.
|
||||
*/
|
||||
if (BufferIsInvalid(wal_redo_buffer))
|
||||
{
|
||||
wal_redo_buffer = NeonRedoReadBuffer(target_redo_tag.rnode,
|
||||
target_redo_tag.forkNum,
|
||||
target_redo_tag.blockNum,
|
||||
RBM_NORMAL);
|
||||
Assert(!BufferIsInvalid(wal_redo_buffer));
|
||||
ReleaseBuffer(wal_redo_buffer);
|
||||
}
|
||||
|
||||
redo_read_buffer_filter = NULL;
|
||||
|
||||
/* Pop the error context stack */
|
||||
error_context_stack = errcallback.previous;
|
||||
|
||||
elog(TRACE, "applied WAL record with LSN %X/%X",
|
||||
(uint32) (lsn >> 32), (uint32) lsn);
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
if (decoded && decoded->oversized)
|
||||
pfree(decoded);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Error context callback for errors occurring during ApplyRecord
|
||||
*/
|
||||
static void
|
||||
apply_error_callback(void *arg)
|
||||
{
|
||||
XLogReaderState *record = (XLogReaderState *) arg;
|
||||
StringInfoData buf;
|
||||
|
||||
initStringInfo(&buf);
|
||||
xlog_outdesc(&buf, record);
|
||||
|
||||
/* translator: %s is a WAL record description */
|
||||
errcontext("WAL redo at %X/%X for %s",
|
||||
LSN_FORMAT_ARGS(record->ReadRecPtr),
|
||||
buf.data);
|
||||
|
||||
pfree(buf.data);
|
||||
}
|
||||
|
||||
|
||||
|
||||
static bool
|
||||
redo_block_filter(XLogReaderState *record, uint8 block_id)
|
||||
{
|
||||
BufferTag target_tag;
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
XLogRecGetBlockTag(record, block_id,
|
||||
&target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum);
|
||||
#else
|
||||
if (!XLogRecGetBlockTag(record, block_id,
|
||||
&target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum))
|
||||
{
|
||||
/* Caller specified a bogus block_id */
|
||||
elog(PANIC, "failed to locate backup block with ID %d", block_id);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Can a WAL redo function ever access a relation other than the one that
|
||||
* it modifies? I don't see why it would.
|
||||
*/
|
||||
if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode))
|
||||
elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u",
|
||||
target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum);
|
||||
|
||||
/*
|
||||
* If this block isn't one we are currently restoring, then return 'true'
|
||||
* so that this gets ignored
|
||||
*/
|
||||
return !BUFFERTAGS_EQUAL(target_tag, target_redo_tag);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get a page image back from buffer cache.
|
||||
*
|
||||
* After applying some records.
|
||||
*/
|
||||
static void
|
||||
GetPage(StringInfo input_message)
|
||||
{
|
||||
RelFileNode rnode;
|
||||
ForkNumber forknum;
|
||||
BlockNumber blknum;
|
||||
Buffer buf;
|
||||
Page page;
|
||||
int tot_written;
|
||||
|
||||
/*
|
||||
* message format:
|
||||
*
|
||||
* spcNode
|
||||
* dbNode
|
||||
* relNode
|
||||
* ForkNumber
|
||||
* BlockNumber
|
||||
*/
|
||||
forknum = pq_getmsgbyte(input_message);
|
||||
rnode.spcNode = pq_getmsgint(input_message, 4);
|
||||
rnode.dbNode = pq_getmsgint(input_message, 4);
|
||||
rnode.relNode = pq_getmsgint(input_message, 4);
|
||||
blknum = pq_getmsgint(input_message, 4);
|
||||
|
||||
/* FIXME: check that we got a BeginRedoForBlock message or this earlier */
|
||||
|
||||
buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_NORMAL);
|
||||
Assert(buf == wal_redo_buffer);
|
||||
page = BufferGetPage(buf);
|
||||
/* single thread, so don't bother locking the page */
|
||||
|
||||
/* Response: Page content */
|
||||
tot_written = 0;
|
||||
do {
|
||||
ssize_t rc;
|
||||
|
||||
rc = write(STDOUT_FILENO, &page[tot_written], BLCKSZ - tot_written);
|
||||
if (rc < 0) {
|
||||
/* If interrupted by signal, just retry */
|
||||
if (errno == EINTR)
|
||||
continue;
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not write to stdout: %m")));
|
||||
}
|
||||
tot_written += rc;
|
||||
} while (tot_written < BLCKSZ);
|
||||
|
||||
ReleaseBuffer(buf);
|
||||
DropRelFileNodeAllLocalBuffers(rnode);
|
||||
wal_redo_buffer = InvalidBuffer;
|
||||
|
||||
elog(TRACE, "Page sent back for block %u", blknum);
|
||||
}
|
||||
|
||||
|
||||
/* Buffer used by buffered_read() */
|
||||
static char stdin_buf[16 * 1024];
|
||||
static size_t stdin_len = 0; /* # of bytes in buffer */
|
||||
static size_t stdin_ptr = 0; /* # of bytes already consumed */
|
||||
|
||||
/*
|
||||
* Like read() on stdin, but buffered.
|
||||
*
|
||||
* We cannot use libc's buffered fread(), because it uses syscalls that we
|
||||
* have disabled with seccomp(). Depending on the platform, it can call
|
||||
* 'fstat' or 'newfstatat'. 'fstat' is probably harmless, but 'newfstatat'
|
||||
* seems problematic because it allows interrogating files by path name.
|
||||
*
|
||||
* The return value is the number of bytes read. On error, -1 is returned, and
|
||||
* errno is set appropriately. Unlike read(), this fills the buffer completely
|
||||
* unless an error happens or EOF is reached.
|
||||
*/
|
||||
static ssize_t
|
||||
buffered_read(void *buf, size_t count)
|
||||
{
|
||||
char *dst = buf;
|
||||
|
||||
while (count > 0)
|
||||
{
|
||||
size_t nthis;
|
||||
|
||||
if (stdin_ptr == stdin_len)
|
||||
{
|
||||
ssize_t ret;
|
||||
|
||||
ret = read(STDIN_FILENO, stdin_buf, sizeof(stdin_buf));
|
||||
if (ret < 0)
|
||||
{
|
||||
/* don't do anything here that could set 'errno' */
|
||||
return ret;
|
||||
}
|
||||
if (ret == 0)
|
||||
{
|
||||
/* EOF */
|
||||
break;
|
||||
}
|
||||
stdin_len = (size_t) ret;
|
||||
stdin_ptr = 0;
|
||||
}
|
||||
nthis = Min(stdin_len - stdin_ptr, count);
|
||||
|
||||
memcpy(dst, &stdin_buf[stdin_ptr], nthis);
|
||||
|
||||
stdin_ptr += nthis;
|
||||
count -= nthis;
|
||||
dst += nthis;
|
||||
}
|
||||
|
||||
return (dst - (char *) buf);
|
||||
}
|
||||
85
poetry.lock
generated
85
poetry.lock
generated
@@ -11,7 +11,7 @@ async-timeout = ">=3.0,<5.0"
|
||||
psycopg2-binary = ">=2.8.4"
|
||||
|
||||
[package.extras]
|
||||
sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"]
|
||||
sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
|
||||
|
||||
[[package]]
|
||||
name = "allure-pytest"
|
||||
@@ -80,7 +80,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||
dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"]
|
||||
docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"]
|
||||
tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"]
|
||||
tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"]
|
||||
tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"]
|
||||
|
||||
[[package]]
|
||||
name = "aws-sam-translator"
|
||||
@@ -514,14 +514,6 @@ python-versions = ">=3.7"
|
||||
[package.dependencies]
|
||||
typing-extensions = ">=4.1.0"
|
||||
|
||||
[[package]]
|
||||
name = "cached-property"
|
||||
version = "1.5.2"
|
||||
description = "A decorator for caching properties in classes."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2022.6.15"
|
||||
@@ -568,7 +560,7 @@ optional = false
|
||||
python-versions = ">=3.6.0"
|
||||
|
||||
[package.extras]
|
||||
unicode_backport = ["unicodedata2"]
|
||||
unicode-backport = ["unicodedata2"]
|
||||
|
||||
[[package]]
|
||||
name = "click"
|
||||
@@ -601,7 +593,7 @@ python-versions = ">=3.6"
|
||||
cffi = ">=1.12"
|
||||
|
||||
[package.extras]
|
||||
docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"]
|
||||
docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx_rtd_theme"]
|
||||
docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
|
||||
pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"]
|
||||
sdist = ["setuptools_rust (>=0.11.4)"]
|
||||
@@ -746,9 +738,9 @@ python-versions = ">=3.6.1,<4.0"
|
||||
|
||||
[package.extras]
|
||||
colors = ["colorama (>=0.4.3,<0.5.0)"]
|
||||
pipfile_deprecated_finder = ["pipreqs", "requirementslib"]
|
||||
pipfile-deprecated-finder = ["pipreqs", "requirementslib"]
|
||||
plugins = ["setuptools"]
|
||||
requirements_deprecated_finder = ["pip-api", "pipreqs"]
|
||||
requirements-deprecated-finder = ["pip-api", "pipreqs"]
|
||||
|
||||
[[package]]
|
||||
name = "itsdangerous"
|
||||
@@ -823,7 +815,7 @@ python-versions = ">=2.7"
|
||||
[package.extras]
|
||||
docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"]
|
||||
testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"]
|
||||
"testing.libs" = ["simplejson", "ujson", "yajl"]
|
||||
testing-libs = ["simplejson", "ujson", "yajl"]
|
||||
|
||||
[[package]]
|
||||
name = "jsonpointer"
|
||||
@@ -844,11 +836,12 @@ python-versions = "*"
|
||||
[package.dependencies]
|
||||
attrs = ">=17.4.0"
|
||||
pyrsistent = ">=0.14.0"
|
||||
setuptools = "*"
|
||||
six = ">=1.11.0"
|
||||
|
||||
[package.extras]
|
||||
format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"]
|
||||
format_nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"]
|
||||
format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"]
|
||||
|
||||
[[package]]
|
||||
name = "junit-xml"
|
||||
@@ -908,6 +901,7 @@ pytz = "*"
|
||||
PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""}
|
||||
requests = ">=2.5"
|
||||
responses = ">=0.9.0"
|
||||
setuptools = {version = "*", optional = true, markers = "extra == \"server\""}
|
||||
sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""}
|
||||
werkzeug = ">=0.5,<2.2.0"
|
||||
xmltodict = "*"
|
||||
@@ -1016,6 +1010,7 @@ python-versions = ">=3.7.0,<4.0.0"
|
||||
jsonschema = ">=3.2.0,<5.0.0"
|
||||
openapi-schema-validator = ">=0.2.0,<0.3.0"
|
||||
PyYAML = ">=5.1"
|
||||
setuptools = "*"
|
||||
|
||||
[package.extras]
|
||||
requests = ["requests"]
|
||||
@@ -1348,7 +1343,7 @@ urllib3 = ">=1.21.1,<1.27"
|
||||
|
||||
[package.extras]
|
||||
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
|
||||
use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"]
|
||||
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
|
||||
|
||||
[[package]]
|
||||
name = "responses"
|
||||
@@ -1402,6 +1397,19 @@ python-versions = ">= 2.7"
|
||||
attrs = "*"
|
||||
pbr = "*"
|
||||
|
||||
[[package]]
|
||||
name = "setuptools"
|
||||
version = "65.5.0"
|
||||
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
|
||||
[package.extras]
|
||||
docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
|
||||
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
|
||||
testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
|
||||
|
||||
[[package]]
|
||||
name = "six"
|
||||
version = "1.16.0"
|
||||
@@ -1468,6 +1476,14 @@ category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7,<4.0"
|
||||
|
||||
[[package]]
|
||||
name = "types-toml"
|
||||
version = "0.10.8"
|
||||
description = "Typing stubs for toml"
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "types-urllib3"
|
||||
version = "1.26.17"
|
||||
@@ -1552,7 +1568,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
|
||||
[metadata]
|
||||
lock-version = "1.1"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "ead1495454ee6d880bb240447025db93a25ebe263c2709de5f144cc2d85dc975"
|
||||
content-hash = "9352a89d49d34807f6a58f6c3f898acbd8cf3570e0f45ede973673644bde4d0e"
|
||||
|
||||
[metadata.files]
|
||||
aiopg = [
|
||||
@@ -1647,10 +1663,6 @@ botocore-stubs = [
|
||||
{file = "botocore-stubs-1.27.38.tar.gz", hash = "sha256:408e8b86b5d171b58f81c74ca9d3b5317a5a8e2d3bc2073aa841ac13b8939e56"},
|
||||
{file = "botocore_stubs-1.27.38-py3-none-any.whl", hash = "sha256:7add7641e9a479a9c8366893bb522fd9ca3d58714201e43662a200a148a1bc38"},
|
||||
]
|
||||
cached-property = [
|
||||
{file = "cached-property-1.5.2.tar.gz", hash = "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130"},
|
||||
{file = "cached_property-1.5.2-py2.py3-none-any.whl", hash = "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"},
|
||||
]
|
||||
certifi = [
|
||||
{file = "certifi-2022.6.15-py3-none-any.whl", hash = "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"},
|
||||
{file = "certifi-2022.6.15.tar.gz", hash = "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d"},
|
||||
@@ -1966,6 +1978,7 @@ prometheus-client = [
|
||||
psycopg2-binary = [
|
||||
{file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"},
|
||||
@@ -1999,6 +2012,7 @@ psycopg2-binary = [
|
||||
{file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"},
|
||||
{file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"},
|
||||
@@ -2010,6 +2024,7 @@ psycopg2-binary = [
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"},
|
||||
@@ -2026,18 +2041,7 @@ py = [
|
||||
{file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
|
||||
]
|
||||
pyasn1 = [
|
||||
{file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"},
|
||||
{file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"},
|
||||
{file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"},
|
||||
{file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"},
|
||||
{file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"},
|
||||
{file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"},
|
||||
{file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"},
|
||||
{file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"},
|
||||
{file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"},
|
||||
{file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"},
|
||||
{file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"},
|
||||
{file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"},
|
||||
{file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"},
|
||||
]
|
||||
pycodestyle = [
|
||||
@@ -2147,6 +2151,13 @@ pyyaml = [
|
||||
{file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
|
||||
{file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
|
||||
{file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
|
||||
{file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
|
||||
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
|
||||
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
|
||||
@@ -2194,6 +2205,10 @@ sarif-om = [
|
||||
{file = "sarif_om-1.0.4-py3-none-any.whl", hash = "sha256:539ef47a662329b1c8502388ad92457425e95dc0aaaf995fe46f4984c4771911"},
|
||||
{file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"},
|
||||
]
|
||||
setuptools = [
|
||||
{file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"},
|
||||
{file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"},
|
||||
]
|
||||
six = [
|
||||
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
|
||||
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
|
||||
@@ -2222,6 +2237,10 @@ types-s3transfer = [
|
||||
{file = "types-s3transfer-0.6.0.post3.tar.gz", hash = "sha256:92c3704e5d041202bfb5ddb79d083fd1a02de2c5dfec6a91576823e6b5c93993"},
|
||||
{file = "types_s3transfer-0.6.0.post3-py3-none-any.whl", hash = "sha256:eedc5117275565b3c83662c0ccc81662a34da5dda8bd502b89d296b6d5cb091d"},
|
||||
]
|
||||
types-toml = [
|
||||
{file = "types-toml-0.10.8.tar.gz", hash = "sha256:b7e7ea572308b1030dc86c3ba825c5210814c2825612ec679eb7814f8dd9295a"},
|
||||
{file = "types_toml-0.10.8-py3-none-any.whl", hash = "sha256:8300fd093e5829eb9c1fba69cee38130347d4b74ddf32d0a7df650ae55c2b599"},
|
||||
]
|
||||
types-urllib3 = [
|
||||
{file = "types-urllib3-1.26.17.tar.gz", hash = "sha256:73fd274524c3fc7cd8cd9ceb0cb67ed99b45f9cb2831013e46d50c1451044800"},
|
||||
{file = "types_urllib3-1.26.17-py3-none-any.whl", hash = "sha256:0d027fcd27dbb3cb532453b4d977e05bc1e13aefd70519866af211b3003d895d"},
|
||||
|
||||
@@ -14,7 +14,6 @@ requests = "^2.26.0"
|
||||
pytest-xdist = "^2.3.0"
|
||||
asyncpg = "^0.24.0"
|
||||
aiopg = "^1.3.1"
|
||||
cached-property = "^1.5.2"
|
||||
Jinja2 = "^3.0.2"
|
||||
types-requests = "^2.28.5"
|
||||
types-psycopg2 = "^2.9.18"
|
||||
@@ -29,12 +28,14 @@ Werkzeug = "2.1.2"
|
||||
pytest-order = "^1.0.1"
|
||||
allure-pytest = "^2.10.0"
|
||||
pytest-asyncio = "^0.19.0"
|
||||
toml = "^0.10.2"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
flake8 = "^5.0.4"
|
||||
mypy = "==0.971"
|
||||
black = "^22.6.0"
|
||||
isort = "^5.10.1"
|
||||
types-toml = "^0.10.8"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=1.0.0"]
|
||||
@@ -74,7 +75,6 @@ strict = true
|
||||
[[tool.mypy.overrides]]
|
||||
module = [
|
||||
"asyncpg.*",
|
||||
"cached_property.*",
|
||||
"pg8000.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# version, we can consider updating.
|
||||
# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package,
|
||||
# we use "unstable" version number as the highest version used in the project by default.
|
||||
channel = "1.61" # do update GitHub CI cache values for rust builds, when changing this value
|
||||
channel = "1.62.1" # do update GitHub CI cache values for rust builds, when changing this value
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -12,7 +12,7 @@ fs2 = "0.4.3"
|
||||
serde_json = "1"
|
||||
tracing = "0.1.27"
|
||||
clap = "4.0"
|
||||
daemonize = "0.4.1"
|
||||
nix = "0.25"
|
||||
tokio = { version = "1.17", features = ["macros", "fs"] }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
|
||||
@@ -4,8 +4,7 @@
|
||||
use anyhow::{bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, Command};
|
||||
use const_format::formatcp;
|
||||
use daemonize::Daemonize;
|
||||
use fs2::FileExt;
|
||||
use nix::unistd::Pid;
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{ErrorKind, Write};
|
||||
@@ -16,12 +15,14 @@ use tokio::sync::mpsc;
|
||||
use toml_edit::Document;
|
||||
use tracing::*;
|
||||
use url::{ParseError, Url};
|
||||
use utils::lock_file;
|
||||
|
||||
use metrics::set_build_info_metric;
|
||||
use safekeeper::broker;
|
||||
use safekeeper::control_file;
|
||||
use safekeeper::defaults::{
|
||||
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
|
||||
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
||||
DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
|
||||
};
|
||||
use safekeeper::http;
|
||||
use safekeeper::remove_wal;
|
||||
@@ -31,11 +32,13 @@ use safekeeper::GlobalTimelines;
|
||||
use safekeeper::SafeKeeperConf;
|
||||
use utils::auth::JwtAuth;
|
||||
use utils::{
|
||||
http::endpoint, id::NodeId, logging, project_git_version, shutdown::exit_now, signals,
|
||||
tcp_listener,
|
||||
http::endpoint,
|
||||
id::NodeId,
|
||||
logging::{self, LogFormat},
|
||||
project_git_version, signals, tcp_listener,
|
||||
};
|
||||
|
||||
const LOCK_FILE_NAME: &str = "safekeeper.lock";
|
||||
const PID_FILE_NAME: &str = "safekeeper.pid";
|
||||
const ID_FILE_NAME: &str = "safekeeper.id";
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
@@ -60,10 +63,6 @@ fn main() -> anyhow::Result<()> {
|
||||
conf.no_sync = true;
|
||||
}
|
||||
|
||||
if arg_matches.get_flag("daemonize") {
|
||||
conf.daemonize = true;
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.get_one::<String>("listen-pg") {
|
||||
conf.listen_pg_addr = addr.to_string();
|
||||
}
|
||||
@@ -72,10 +71,6 @@ fn main() -> anyhow::Result<()> {
|
||||
conf.listen_http_addr = addr.to_string();
|
||||
}
|
||||
|
||||
if let Some(recall) = arg_matches.get_one::<String>("recall") {
|
||||
conf.recall_period = humantime::parse_duration(recall)?;
|
||||
}
|
||||
|
||||
let mut given_id = None;
|
||||
if let Some(given_id_str) = arg_matches.get_one::<String>("id") {
|
||||
given_id = Some(NodeId(
|
||||
@@ -93,6 +88,16 @@ fn main() -> anyhow::Result<()> {
|
||||
conf.broker_etcd_prefix = prefix.to_string();
|
||||
}
|
||||
|
||||
if let Some(heartbeat_timeout_str) = arg_matches.get_one::<String>("heartbeat-timeout") {
|
||||
conf.heartbeat_timeout =
|
||||
humantime::parse_duration(heartbeat_timeout_str).with_context(|| {
|
||||
format!(
|
||||
"failed to parse heartbeat-timeout {}",
|
||||
heartbeat_timeout_str
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
if let Some(backup_threads) = arg_matches.get_one::<String>("wal-backup-threads") {
|
||||
conf.backup_runtime_threads = backup_threads
|
||||
.parse()
|
||||
@@ -105,6 +110,14 @@ fn main() -> anyhow::Result<()> {
|
||||
let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again
|
||||
conf.remote_storage = Some(RemoteStorageConfig::from_toml(storage_conf_parsed_toml)?);
|
||||
}
|
||||
if let Some(max_offloader_lag_str) = arg_matches.get_one::<String>("max-offloader-lag") {
|
||||
conf.max_offloader_lag_bytes = max_offloader_lag_str.parse().with_context(|| {
|
||||
format!(
|
||||
"failed to parse max offloader lag {}",
|
||||
max_offloader_lag_str
|
||||
)
|
||||
})?;
|
||||
}
|
||||
// Seems like there is no better way to accept bool values explicitly in clap.
|
||||
conf.wal_backup_enabled = arg_matches
|
||||
.get_one::<String>("enable-wal-backup")
|
||||
@@ -116,23 +129,41 @@ fn main() -> anyhow::Result<()> {
|
||||
.get_one::<String>("auth-validation-public-key-path")
|
||||
.map(PathBuf::from);
|
||||
|
||||
if let Some(log_format) = arg_matches.get_one::<String>("log-format") {
|
||||
conf.log_format = LogFormat::from_config(log_format)?;
|
||||
}
|
||||
|
||||
start_safekeeper(conf, given_id, arg_matches.get_flag("init"))
|
||||
}
|
||||
|
||||
fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bool) -> Result<()> {
|
||||
let log_file = logging::init("safekeeper.log", conf.daemonize)?;
|
||||
|
||||
logging::init(conf.log_format)?;
|
||||
info!("version: {GIT_VERSION}");
|
||||
|
||||
// Prevent running multiple safekeepers on the same directory
|
||||
let lock_file_path = conf.workdir.join(LOCK_FILE_NAME);
|
||||
let lock_file = File::create(&lock_file_path).context("failed to open lockfile")?;
|
||||
lock_file.try_lock_exclusive().with_context(|| {
|
||||
format!(
|
||||
"control file {} is locked by some other process",
|
||||
lock_file_path.display()
|
||||
)
|
||||
})?;
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
|
||||
lock_file::LockCreationResult::Created {
|
||||
new_lock_contents,
|
||||
file,
|
||||
} => {
|
||||
info!("Created lock file at {lock_file_path:?} with contents {new_lock_contents}");
|
||||
file
|
||||
}
|
||||
lock_file::LockCreationResult::AlreadyLocked {
|
||||
existing_lock_contents,
|
||||
} => anyhow::bail!(
|
||||
"Could not lock pid file; safekeeper is already running in {:?} with PID {}",
|
||||
conf.workdir,
|
||||
existing_lock_contents
|
||||
),
|
||||
lock_file::LockCreationResult::CreationFailed(e) => {
|
||||
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
|
||||
}
|
||||
};
|
||||
// ensure that the lock file is held even if the main thread of the process panics
|
||||
// we need to release the lock file only when the current process is gone
|
||||
let _ = Box::leak(Box::new(lock_file));
|
||||
|
||||
// Set or read our ID.
|
||||
set_id(&mut conf, given_id)?;
|
||||
@@ -164,31 +195,6 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
||||
}
|
||||
};
|
||||
|
||||
// XXX: Don't spawn any threads before daemonizing!
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
// There shouldn't be any logging to stdout/stderr. Redirect it to the main log so
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
let stdout = log_file.try_clone().unwrap();
|
||||
let stderr = log_file;
|
||||
|
||||
let daemonize = Daemonize::new()
|
||||
.pid_file("safekeeper.pid")
|
||||
.working_directory(Path::new("."))
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
|
||||
// XXX: The parent process should exit abruptly right after
|
||||
// it has spawned a child to prevent coverage machinery from
|
||||
// dumping stats into a `profraw` file now owned by the child.
|
||||
// Otherwise, the coverage data will be damaged.
|
||||
match daemonize.exit_action(|| exit_now(0)).start() {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(err) => bail!("Error: {err}. could not daemonize. bailing."),
|
||||
}
|
||||
}
|
||||
|
||||
// Register metrics collector for active timelines. It's important to do this
|
||||
// after daemonizing, otherwise process collector will be upset.
|
||||
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
|
||||
@@ -361,18 +367,6 @@ fn cli() -> Command {
|
||||
.short('p')
|
||||
.long("pageserver"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("recall")
|
||||
.long("recall")
|
||||
.help("Period for requestion pageserver to call for replication"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("daemonize")
|
||||
.short('d')
|
||||
.long("daemonize")
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Run in the background"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("no-sync")
|
||||
.short('n')
|
||||
@@ -397,6 +391,11 @@ fn cli() -> Command {
|
||||
.long("broker-etcd-prefix")
|
||||
.help("a prefix to always use when polling/pusing data in etcd from this safekeeper"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("heartbeat-timeout")
|
||||
.long("heartbeat-timeout")
|
||||
.help(formatcp!("Peer is considered dead after not receiving heartbeats from it during this period (default {}s), passed as a human readable duration.", DEFAULT_HEARTBEAT_TIMEOUT.as_secs()))
|
||||
)
|
||||
.arg(
|
||||
Arg::new("wal-backup-threads").long("backup-threads").help(formatcp!("number of threads for wal backup (default {DEFAULT_WAL_BACKUP_RUNTIME_THREADS}")),
|
||||
).arg(
|
||||
@@ -404,6 +403,11 @@ fn cli() -> Command {
|
||||
.long("remote-storage")
|
||||
.help("Remote storage configuration for WAL backup (offloading to s3) as TOML inline table, e.g. {\"max_concurrent_syncs\" = 17, \"max_sync_errors\": 13, \"bucket_name\": \"<BUCKETNAME>\", \"bucket_region\":\"<REGION>\", \"concurrency_limit\": 119}.\nSafekeeper offloads WAL to [prefix_in_bucket/]<tenant_id>/<timeline_id>/<segment_file>, mirroring structure on the file system.")
|
||||
)
|
||||
.arg(
|
||||
Arg::new("max-offloader-lag")
|
||||
.long("max-offloader-lag")
|
||||
.help(formatcp!("Safekeeper won't be elected for WAL offloading if it is lagging for more than this value (default {}MB) in bytes", DEFAULT_MAX_OFFLOADER_LAG_BYTES / (1 << 20)))
|
||||
)
|
||||
.arg(
|
||||
Arg::new("enable-wal-backup")
|
||||
.long("enable-wal-backup")
|
||||
@@ -416,6 +420,11 @@ fn cli() -> Command {
|
||||
.long("auth-validation-public-key-path")
|
||||
.help("Path to an RSA .pem public key which is used to check JWT tokens")
|
||||
)
|
||||
.arg(
|
||||
Arg::new("log-format")
|
||||
.long("log-format")
|
||||
.help("Format for logging, either 'plain' or 'json'")
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
//! Communication with etcd, providing safekeeper peers and pageserver coordination.
|
||||
|
||||
use anyhow::anyhow;
|
||||
use anyhow::Context;
|
||||
use anyhow::Error;
|
||||
use anyhow::Result;
|
||||
@@ -12,11 +11,9 @@ use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::time::Duration;
|
||||
use tokio::spawn;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::{runtime, time::sleep};
|
||||
use tracing::*;
|
||||
use url::Url;
|
||||
|
||||
use crate::GlobalTimelines;
|
||||
use crate::SafeKeeperConf;
|
||||
@@ -56,113 +53,6 @@ fn timeline_safekeeper_path(
|
||||
)
|
||||
}
|
||||
|
||||
pub struct Election {
|
||||
pub election_name: String,
|
||||
pub candidate_name: String,
|
||||
pub broker_endpoints: Vec<Url>,
|
||||
}
|
||||
|
||||
impl Election {
|
||||
pub fn new(election_name: String, candidate_name: String, broker_endpoints: Vec<Url>) -> Self {
|
||||
Self {
|
||||
election_name,
|
||||
candidate_name,
|
||||
broker_endpoints,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ElectionLeader {
|
||||
client: Client,
|
||||
keep_alive: JoinHandle<Result<()>>,
|
||||
}
|
||||
|
||||
impl ElectionLeader {
|
||||
pub async fn check_am_i(
|
||||
&mut self,
|
||||
election_name: String,
|
||||
candidate_name: String,
|
||||
) -> Result<bool> {
|
||||
let resp = self.client.leader(election_name).await?;
|
||||
|
||||
let kv = resp
|
||||
.kv()
|
||||
.ok_or_else(|| anyhow!("failed to get leader response"))?;
|
||||
let leader = kv.value_str()?;
|
||||
|
||||
Ok(leader == candidate_name)
|
||||
}
|
||||
|
||||
pub async fn give_up(self) {
|
||||
self.keep_alive.abort();
|
||||
// TODO: it'll be wise to resign here but it'll happen after lease expiration anyway
|
||||
// should we await for keep alive termination?
|
||||
let _ = self.keep_alive.await;
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get_leader(req: &Election, leader: &mut Option<ElectionLeader>) -> Result<()> {
|
||||
let mut client = Client::connect(req.broker_endpoints.clone(), None)
|
||||
.await
|
||||
.context("Could not connect to etcd")?;
|
||||
|
||||
let lease = client
|
||||
.lease_grant(LEASE_TTL_SEC, None)
|
||||
.await
|
||||
.context("Could not acquire a lease");
|
||||
|
||||
let lease_id = lease.map(|l| l.id()).unwrap();
|
||||
|
||||
// kill previous keepalive, if any
|
||||
if let Some(l) = leader.take() {
|
||||
l.give_up().await;
|
||||
}
|
||||
|
||||
let keep_alive = spawn::<_>(lease_keep_alive(client.clone(), lease_id));
|
||||
// immediately save handle to kill task if we get canceled below
|
||||
*leader = Some(ElectionLeader {
|
||||
client: client.clone(),
|
||||
keep_alive,
|
||||
});
|
||||
|
||||
client
|
||||
.campaign(
|
||||
req.election_name.clone(),
|
||||
req.candidate_name.clone(),
|
||||
lease_id,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn lease_keep_alive(mut client: Client, lease_id: i64) -> Result<()> {
|
||||
let (mut keeper, mut ka_stream) = client
|
||||
.lease_keep_alive(lease_id)
|
||||
.await
|
||||
.context("failed to create keepalive stream")?;
|
||||
|
||||
loop {
|
||||
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
|
||||
|
||||
keeper
|
||||
.keep_alive()
|
||||
.await
|
||||
.context("failed to send LeaseKeepAliveRequest")?;
|
||||
|
||||
ka_stream
|
||||
.message()
|
||||
.await
|
||||
.context("failed to receive LeaseKeepAliveResponse")?;
|
||||
|
||||
sleep(push_interval).await;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_candiate_name(system_id: NodeId) -> String {
|
||||
format!("id_{system_id}")
|
||||
}
|
||||
|
||||
async fn push_sk_info(
|
||||
ttid: TenantTimelineId,
|
||||
mut client: Client,
|
||||
@@ -236,7 +126,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
let handles = active_tlis
|
||||
.iter()
|
||||
.map(|tli| {
|
||||
let sk_info = tli.get_public_info(&conf);
|
||||
let sk_info = tli.get_safekeeper_info(&conf);
|
||||
let key =
|
||||
timeline_safekeeper_path(conf.broker_etcd_prefix.clone(), tli.ttid, conf.my_id);
|
||||
let lease = leases.remove(&tli.ttid).unwrap();
|
||||
@@ -282,6 +172,9 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
|
||||
Some(new_info) => {
|
||||
// note: there are blocking operations below, but it's considered fine for now
|
||||
if let Ok(tli) = GlobalTimelines::get(new_info.key.id) {
|
||||
// Note that we also receive *our own* info. That's
|
||||
// important, as it is used as an indication of live
|
||||
// connection to the broker.
|
||||
tli.record_safekeeper_info(&new_info.value, new_info.key.node_id)
|
||||
.await?
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
//! Code to deal with safekeeper control file upgrades
|
||||
use crate::safekeeper::{
|
||||
AcceptorState, Peers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry,
|
||||
AcceptorState, PersistedPeers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory,
|
||||
TermSwitchEntry,
|
||||
};
|
||||
use anyhow::{bail, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -134,7 +135,7 @@ pub struct SafeKeeperStateV4 {
|
||||
// fundamental; but state is saved here only for informational purposes and
|
||||
// obviously can be stale. (Currently not saved at all, but let's provision
|
||||
// place to have less file version upgrades).
|
||||
pub peers: Peers,
|
||||
pub peers: PersistedPeers,
|
||||
}
|
||||
|
||||
pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState> {
|
||||
@@ -165,7 +166,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
|
||||
backup_lsn: Lsn(0),
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: Peers(vec![]),
|
||||
peers: PersistedPeers(vec![]),
|
||||
});
|
||||
// migrate to hexing some ids
|
||||
} else if version == 2 {
|
||||
@@ -188,7 +189,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
|
||||
backup_lsn: Lsn(0),
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: Peers(vec![]),
|
||||
peers: PersistedPeers(vec![]),
|
||||
});
|
||||
// migrate to moving tenant_id/timeline_id to the top and adding some lsns
|
||||
} else if version == 3 {
|
||||
@@ -211,7 +212,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
|
||||
backup_lsn: Lsn(0),
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: Peers(vec![]),
|
||||
peers: PersistedPeers(vec![]),
|
||||
});
|
||||
// migrate to having timeline_start_lsn
|
||||
} else if version == 4 {
|
||||
@@ -234,7 +235,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
|
||||
backup_lsn: Lsn::INVALID,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: Peers(vec![]),
|
||||
peers: PersistedPeers(vec![]),
|
||||
});
|
||||
} else if version == 5 {
|
||||
info!("reading safekeeper control file version {}", version);
|
||||
|
||||
@@ -1,11 +1,16 @@
|
||||
use defaults::DEFAULT_WAL_BACKUP_RUNTIME_THREADS;
|
||||
use defaults::{
|
||||
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
|
||||
};
|
||||
//
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
use url::Url;
|
||||
|
||||
use utils::id::{NodeId, TenantId, TenantTimelineId};
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TenantTimelineId},
|
||||
logging::LogFormat,
|
||||
};
|
||||
|
||||
pub mod broker;
|
||||
pub mod control_file;
|
||||
@@ -34,8 +39,9 @@ pub mod defaults {
|
||||
DEFAULT_PG_LISTEN_PORT,
|
||||
};
|
||||
|
||||
pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(10);
|
||||
pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8;
|
||||
pub const DEFAULT_HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20);
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -48,11 +54,9 @@ pub struct SafeKeeperConf {
|
||||
// data directories to avoid clashing with each other.
|
||||
pub workdir: PathBuf,
|
||||
|
||||
pub daemonize: bool,
|
||||
pub no_sync: bool,
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
pub recall_period: Duration,
|
||||
pub remote_storage: Option<RemoteStorageConfig>,
|
||||
pub backup_runtime_threads: usize,
|
||||
pub wal_backup_enabled: bool,
|
||||
@@ -60,6 +64,9 @@ pub struct SafeKeeperConf {
|
||||
pub broker_endpoints: Vec<Url>,
|
||||
pub broker_etcd_prefix: String,
|
||||
pub auth_validation_public_key_path: Option<PathBuf>,
|
||||
pub heartbeat_timeout: Duration,
|
||||
pub max_offloader_lag_bytes: u64,
|
||||
pub log_format: LogFormat,
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
@@ -80,18 +87,19 @@ impl Default for SafeKeeperConf {
|
||||
// command line, so that when the server is running, all paths are relative
|
||||
// to that.
|
||||
workdir: PathBuf::from("./"),
|
||||
daemonize: false,
|
||||
no_sync: false,
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
remote_storage: None,
|
||||
recall_period: defaults::DEFAULT_RECALL_PERIOD,
|
||||
my_id: NodeId(0),
|
||||
broker_endpoints: Vec::new(),
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
|
||||
wal_backup_enabled: true,
|
||||
auth_validation_public_key_path: None,
|
||||
heartbeat_timeout: DEFAULT_HEARTBEAT_TIMEOUT,
|
||||
max_offloader_lag_bytes: DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
||||
log_format: LogFormat::Plain,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@ use std::cmp::max;
|
||||
use std::cmp::min;
|
||||
use std::fmt;
|
||||
use std::io::Read;
|
||||
|
||||
use tracing::*;
|
||||
|
||||
use crate::control_file;
|
||||
@@ -132,9 +133,8 @@ pub struct ServerInfo {
|
||||
pub wal_seg_size: u32,
|
||||
}
|
||||
|
||||
/// Data published by safekeeper to the peers
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PeerInfo {
|
||||
pub struct PersistedPeerInfo {
|
||||
/// LSN up to which safekeeper offloaded WAL to s3.
|
||||
backup_lsn: Lsn,
|
||||
/// Term of the last entry.
|
||||
@@ -145,7 +145,7 @@ pub struct PeerInfo {
|
||||
commit_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl PeerInfo {
|
||||
impl PersistedPeerInfo {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
backup_lsn: Lsn::INVALID,
|
||||
@@ -156,10 +156,8 @@ impl PeerInfo {
|
||||
}
|
||||
}
|
||||
|
||||
// vector-based node id -> peer state map with very limited functionality we
|
||||
// need/
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Peers(pub Vec<(NodeId, PeerInfo)>);
|
||||
pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);
|
||||
|
||||
/// Persistent information stored on safekeeper node
|
||||
/// On disk data is prefixed by magic and format version and followed by checksum.
|
||||
@@ -203,7 +201,7 @@ pub struct SafeKeeperState {
|
||||
// fundamental; but state is saved here only for informational purposes and
|
||||
// obviously can be stale. (Currently not saved at all, but let's provision
|
||||
// place to have less file version upgrades).
|
||||
pub peers: Peers,
|
||||
pub peers: PersistedPeers,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -240,7 +238,12 @@ impl SafeKeeperState {
|
||||
backup_lsn: local_start_lsn,
|
||||
peer_horizon_lsn: local_start_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: Peers(peers.iter().map(|p| (*p, PeerInfo::new())).collect()),
|
||||
peers: PersistedPeers(
|
||||
peers
|
||||
.iter()
|
||||
.map(|p| (*p, PersistedPeerInfo::new()))
|
||||
.collect(),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ use etcd_broker::subscription_value::SkTimelineInfo;
|
||||
|
||||
use postgres_ffi::XLogSegNo;
|
||||
|
||||
use tokio::sync::watch;
|
||||
use tokio::{sync::watch, time::Instant};
|
||||
|
||||
use std::cmp::{max, min};
|
||||
|
||||
@@ -26,7 +26,7 @@ use utils::{
|
||||
|
||||
use crate::safekeeper::{
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
|
||||
SafekeeperMemState, ServerInfo,
|
||||
SafekeeperMemState, ServerInfo, Term,
|
||||
};
|
||||
use crate::send_wal::HotStandbyFeedback;
|
||||
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
|
||||
@@ -36,6 +36,53 @@ use crate::wal_storage;
|
||||
use crate::wal_storage::Storage as wal_storage_iface;
|
||||
use crate::SafeKeeperConf;
|
||||
|
||||
/// Things safekeeper should know about timeline state on peers.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PeerInfo {
|
||||
pub sk_id: NodeId,
|
||||
/// Term of the last entry.
|
||||
_last_log_term: Term,
|
||||
/// LSN of the last record.
|
||||
_flush_lsn: Lsn,
|
||||
pub commit_lsn: Lsn,
|
||||
/// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
|
||||
/// sk since backup_lsn.
|
||||
pub local_start_lsn: Lsn,
|
||||
/// When info was received.
|
||||
ts: Instant,
|
||||
}
|
||||
|
||||
impl PeerInfo {
|
||||
fn from_sk_info(sk_id: NodeId, sk_info: &SkTimelineInfo, ts: Instant) -> PeerInfo {
|
||||
PeerInfo {
|
||||
sk_id,
|
||||
_last_log_term: sk_info.last_log_term.unwrap_or(0),
|
||||
_flush_lsn: sk_info.flush_lsn.unwrap_or(Lsn::INVALID),
|
||||
commit_lsn: sk_info.commit_lsn.unwrap_or(Lsn::INVALID),
|
||||
local_start_lsn: sk_info.local_start_lsn.unwrap_or(Lsn::INVALID),
|
||||
ts,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// vector-based node id -> peer state map with very limited functionality we
|
||||
// need.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct PeersInfo(pub Vec<PeerInfo>);
|
||||
|
||||
impl PeersInfo {
|
||||
fn get(&mut self, id: NodeId) -> Option<&mut PeerInfo> {
|
||||
self.0.iter_mut().find(|p| p.sk_id == id)
|
||||
}
|
||||
|
||||
fn upsert(&mut self, p: &PeerInfo) {
|
||||
match self.get(p.sk_id) {
|
||||
Some(rp) => *rp = p.clone(),
|
||||
None => self.0.push(p.clone()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Replica status update + hot standby feedback
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct ReplicaState {
|
||||
@@ -74,6 +121,8 @@ impl ReplicaState {
|
||||
pub struct SharedState {
|
||||
/// Safekeeper object
|
||||
sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
|
||||
/// In memory list containing state of peers sent in latest messages from them.
|
||||
peers_info: PeersInfo,
|
||||
/// State of replicas
|
||||
replicas: Vec<Option<ReplicaState>>,
|
||||
/// True when WAL backup launcher oversees the timeline, making sure WAL is
|
||||
@@ -123,7 +172,8 @@ impl SharedState {
|
||||
|
||||
Ok(Self {
|
||||
sk,
|
||||
replicas: Vec::new(),
|
||||
peers_info: PeersInfo(vec![]),
|
||||
replicas: vec![],
|
||||
wal_backup_active: false,
|
||||
active: false,
|
||||
num_computes: 0,
|
||||
@@ -142,6 +192,7 @@ impl SharedState {
|
||||
|
||||
Ok(Self {
|
||||
sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,
|
||||
peers_info: PeersInfo(vec![]),
|
||||
replicas: Vec::new(),
|
||||
wal_backup_active: false,
|
||||
active: false,
|
||||
@@ -201,12 +252,6 @@ impl SharedState {
|
||||
self.wal_backup_active
|
||||
}
|
||||
|
||||
// Can this safekeeper offload to s3? Recently joined safekeepers might not
|
||||
// have necessary WAL.
|
||||
fn can_wal_backup(&self) -> bool {
|
||||
self.sk.state.local_start_lsn <= self.sk.inmem.backup_lsn
|
||||
}
|
||||
|
||||
fn get_wal_seg_size(&self) -> usize {
|
||||
self.sk.state.server.wal_seg_size as usize
|
||||
}
|
||||
@@ -268,6 +313,24 @@ impl SharedState {
|
||||
self.replicas.push(Some(state));
|
||||
pos
|
||||
}
|
||||
|
||||
fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo {
|
||||
SkTimelineInfo {
|
||||
last_log_term: Some(self.sk.get_epoch()),
|
||||
flush_lsn: Some(self.sk.wal_store.flush_lsn()),
|
||||
// note: this value is not flushed to control file yet and can be lost
|
||||
commit_lsn: Some(self.sk.inmem.commit_lsn),
|
||||
// TODO: rework feedbacks to avoid max here
|
||||
remote_consistent_lsn: Some(max(
|
||||
self.get_replicas_state().remote_consistent_lsn,
|
||||
self.sk.inmem.remote_consistent_lsn,
|
||||
)),
|
||||
peer_horizon_lsn: Some(self.sk.inmem.peer_horizon_lsn),
|
||||
safekeeper_connstr: Some(conf.listen_pg_addr.clone()),
|
||||
backup_lsn: Some(self.sk.inmem.backup_lsn),
|
||||
local_start_lsn: Some(self.sk.state.local_start_lsn),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -517,17 +580,6 @@ impl Timeline {
|
||||
self.write_shared_state().wal_backup_attend()
|
||||
}
|
||||
|
||||
/// Can this safekeeper offload to s3? Recently joined safekeepers might not
|
||||
/// have necessary WAL.
|
||||
pub fn can_wal_backup(&self) -> bool {
|
||||
if self.is_cancelled() {
|
||||
return false;
|
||||
}
|
||||
|
||||
let shared_state = self.write_shared_state();
|
||||
shared_state.can_wal_backup()
|
||||
}
|
||||
|
||||
/// Returns full timeline info, required for the metrics. If the timeline is
|
||||
/// not active, returns None instead.
|
||||
pub fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
|
||||
@@ -632,36 +684,25 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Return public safekeeper info for broadcasting to broker and other peers.
|
||||
pub fn get_public_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo {
|
||||
/// Get safekeeper info for broadcasting to broker and other peers.
|
||||
pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo {
|
||||
let shared_state = self.write_shared_state();
|
||||
SkTimelineInfo {
|
||||
last_log_term: Some(shared_state.sk.get_epoch()),
|
||||
flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()),
|
||||
// note: this value is not flushed to control file yet and can be lost
|
||||
commit_lsn: Some(shared_state.sk.inmem.commit_lsn),
|
||||
// TODO: rework feedbacks to avoid max here
|
||||
remote_consistent_lsn: Some(max(
|
||||
shared_state.get_replicas_state().remote_consistent_lsn,
|
||||
shared_state.sk.inmem.remote_consistent_lsn,
|
||||
)),
|
||||
peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn),
|
||||
safekeeper_connstr: Some(conf.listen_pg_addr.clone()),
|
||||
backup_lsn: Some(shared_state.sk.inmem.backup_lsn),
|
||||
}
|
||||
shared_state.get_safekeeper_info(conf)
|
||||
}
|
||||
|
||||
/// Update timeline state with peer safekeeper data.
|
||||
pub async fn record_safekeeper_info(
|
||||
&self,
|
||||
sk_info: &SkTimelineInfo,
|
||||
_sk_id: NodeId,
|
||||
sk_id: NodeId,
|
||||
) -> Result<()> {
|
||||
let is_wal_backup_action_pending: bool;
|
||||
let commit_lsn: Lsn;
|
||||
{
|
||||
let mut shared_state = self.write_shared_state();
|
||||
shared_state.sk.record_safekeeper_info(sk_info)?;
|
||||
let peer_info = PeerInfo::from_sk_info(sk_id, sk_info, Instant::now());
|
||||
shared_state.peers_info.upsert(&peer_info);
|
||||
is_wal_backup_action_pending = shared_state.update_status(self.ttid);
|
||||
commit_lsn = shared_state.sk.inmem.commit_lsn;
|
||||
}
|
||||
@@ -673,6 +714,22 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get our latest view of alive peers status on the timeline.
|
||||
/// We pass our own info through the broker as well, so when we don't have connection
|
||||
/// to the broker returned vec is empty.
|
||||
pub fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
|
||||
let shared_state = self.write_shared_state();
|
||||
let now = Instant::now();
|
||||
shared_state
|
||||
.peers_info
|
||||
.0
|
||||
.iter()
|
||||
// Regard peer as absent if we haven't heard from it within heartbeat_timeout.
|
||||
.filter(|p| now.duration_since(p.ts) <= conf.heartbeat_timeout)
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Add send_wal replica to the in-memory vector of replicas.
|
||||
pub fn add_replica(&self, state: ReplicaState) -> usize {
|
||||
self.write_shared_state().add_replica(state)
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use anyhow::{Context, Result};
|
||||
use etcd_broker::subscription_key::{
|
||||
NodeKind, OperationKind, SkOperationKind, SubscriptionKey, SubscriptionKind,
|
||||
};
|
||||
|
||||
use tokio::task::JoinHandle;
|
||||
use utils::id::NodeId;
|
||||
|
||||
use std::cmp::min;
|
||||
use std::collections::HashMap;
|
||||
@@ -26,14 +25,11 @@ use tracing::*;
|
||||
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
use crate::broker::{Election, ElectionLeader};
|
||||
use crate::timeline::Timeline;
|
||||
use crate::{broker, GlobalTimelines, SafeKeeperConf};
|
||||
use crate::timeline::{PeerInfo, Timeline};
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
|
||||
const BROKER_CONNECTION_RETRY_DELAY_MS: u64 = 1000;
|
||||
|
||||
const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
|
||||
const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
|
||||
|
||||
@@ -70,47 +66,104 @@ struct WalBackupTimelineEntry {
|
||||
handle: Option<WalBackupTaskHandle>,
|
||||
}
|
||||
|
||||
/// Start per timeline task, if it makes sense for this safekeeper to offload.
|
||||
fn consider_start_task(
|
||||
async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry) {
|
||||
if let Some(wb_handle) = entry.handle.take() {
|
||||
// Tell the task to shutdown. Error means task exited earlier, that's ok.
|
||||
let _ = wb_handle.shutdown_tx.send(()).await;
|
||||
// Await the task itself. TODO: restart panicked tasks earlier.
|
||||
if let Err(e) = wb_handle.handle.await {
|
||||
warn!("WAL backup task for {} panicked: {}", ttid, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The goal is to ensure that normally only one safekeepers offloads. However,
|
||||
/// it is fine (and inevitable, as s3 doesn't provide CAS) that for some short
|
||||
/// time we have several ones as they PUT the same files. Also,
|
||||
/// - frequently changing the offloader would be bad;
|
||||
/// - electing seriously lagging safekeeper is undesirable;
|
||||
/// So we deterministically choose among the reasonably caught up candidates.
|
||||
/// TODO: take into account failed attempts to deal with hypothetical situation
|
||||
/// where s3 is unreachable only for some sks.
|
||||
fn determine_offloader(
|
||||
alive_peers: &[PeerInfo],
|
||||
wal_backup_lsn: Lsn,
|
||||
ttid: TenantTimelineId,
|
||||
conf: &SafeKeeperConf,
|
||||
) -> (Option<NodeId>, String) {
|
||||
// TODO: remove this once we fill newly joined safekeepers since backup_lsn.
|
||||
let capable_peers = alive_peers
|
||||
.iter()
|
||||
.filter(|p| p.local_start_lsn <= wal_backup_lsn);
|
||||
match capable_peers.clone().map(|p| p.commit_lsn).max() {
|
||||
None => (None, "no connected peers to elect from".to_string()),
|
||||
Some(max_commit_lsn) => {
|
||||
let threshold = max_commit_lsn
|
||||
.checked_sub(conf.max_offloader_lag_bytes)
|
||||
.unwrap_or(Lsn(0));
|
||||
let mut caughtup_peers = capable_peers
|
||||
.clone()
|
||||
.filter(|p| p.commit_lsn >= threshold)
|
||||
.collect::<Vec<_>>();
|
||||
caughtup_peers.sort_by(|p1, p2| p1.sk_id.cmp(&p2.sk_id));
|
||||
|
||||
// To distribute the load, shift by timeline_id.
|
||||
let offloader = caughtup_peers
|
||||
[(u128::from(ttid.timeline_id) % caughtup_peers.len() as u128) as usize]
|
||||
.sk_id;
|
||||
|
||||
let mut capable_peers_dbg = capable_peers
|
||||
.map(|p| (p.sk_id, p.commit_lsn))
|
||||
.collect::<Vec<_>>();
|
||||
capable_peers_dbg.sort_by(|p1, p2| p1.0.cmp(&p2.0));
|
||||
(
|
||||
Some(offloader),
|
||||
format!(
|
||||
"elected {} among {:?} peers, with {} of them being caughtup",
|
||||
offloader,
|
||||
capable_peers_dbg,
|
||||
caughtup_peers.len()
|
||||
),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Based on peer information determine which safekeeper should offload; if it
|
||||
/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task
|
||||
/// is running, kill it.
|
||||
async fn update_task(
|
||||
conf: &SafeKeeperConf,
|
||||
ttid: TenantTimelineId,
|
||||
task: &mut WalBackupTimelineEntry,
|
||||
entry: &mut WalBackupTimelineEntry,
|
||||
) {
|
||||
if !task.timeline.can_wal_backup() {
|
||||
return;
|
||||
let alive_peers = entry.timeline.get_peers(conf);
|
||||
let wal_backup_lsn = entry.timeline.get_wal_backup_lsn();
|
||||
let (offloader, election_dbg_str) =
|
||||
determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf);
|
||||
let elected_me = Some(conf.my_id) == offloader;
|
||||
|
||||
if elected_me != (entry.handle.is_some()) {
|
||||
if elected_me {
|
||||
info!("elected for backup {}: {}", ttid, election_dbg_str);
|
||||
|
||||
let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
|
||||
let timeline_dir = conf.timeline_dir(&ttid);
|
||||
|
||||
let handle = tokio::spawn(
|
||||
backup_task_main(ttid, timeline_dir, shutdown_rx)
|
||||
.instrument(info_span!("WAL backup task", ttid = %ttid)),
|
||||
);
|
||||
|
||||
entry.handle = Some(WalBackupTaskHandle {
|
||||
shutdown_tx,
|
||||
handle,
|
||||
});
|
||||
} else {
|
||||
info!("stepping down from backup {}: {}", ttid, election_dbg_str);
|
||||
shut_down_task(ttid, entry).await;
|
||||
}
|
||||
}
|
||||
info!("starting WAL backup task for {}", ttid);
|
||||
|
||||
// TODO: decide who should offload right here by simply checking current
|
||||
// state instead of running elections in offloading task.
|
||||
let election_name = SubscriptionKey {
|
||||
cluster_prefix: conf.broker_etcd_prefix.clone(),
|
||||
kind: SubscriptionKind::Operation(
|
||||
ttid,
|
||||
NodeKind::Safekeeper,
|
||||
OperationKind::Safekeeper(SkOperationKind::WalBackup),
|
||||
),
|
||||
}
|
||||
.watch_key();
|
||||
let my_candidate_name = broker::get_candiate_name(conf.my_id);
|
||||
let election = broker::Election::new(
|
||||
election_name,
|
||||
my_candidate_name,
|
||||
conf.broker_endpoints.clone(),
|
||||
);
|
||||
|
||||
let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
|
||||
let timeline_dir = conf.timeline_dir(&ttid);
|
||||
|
||||
let handle = tokio::spawn(
|
||||
backup_task_main(ttid, timeline_dir, shutdown_rx, election)
|
||||
.instrument(info_span!("WAL backup task", ttid = %ttid)),
|
||||
);
|
||||
|
||||
task.handle = Some(WalBackupTaskHandle {
|
||||
shutdown_tx,
|
||||
handle,
|
||||
});
|
||||
}
|
||||
|
||||
const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
|
||||
@@ -158,27 +211,20 @@ async fn wal_backup_launcher_main_loop(
|
||||
timeline,
|
||||
handle: None,
|
||||
});
|
||||
consider_start_task(&conf, ttid, entry);
|
||||
update_task(&conf, ttid, entry).await;
|
||||
} else {
|
||||
// need to stop the task
|
||||
info!("stopping WAL backup task for {}", ttid);
|
||||
|
||||
let entry = tasks.remove(&ttid).unwrap();
|
||||
if let Some(wb_handle) = entry.handle {
|
||||
// Tell the task to shutdown. Error means task exited earlier, that's ok.
|
||||
let _ = wb_handle.shutdown_tx.send(()).await;
|
||||
// Await the task itself. TODO: restart panicked tasks earlier.
|
||||
if let Err(e) = wb_handle.handle.await {
|
||||
warn!("WAL backup task for {} panicked: {}", ttid, e);
|
||||
}
|
||||
}
|
||||
let mut entry = tasks.remove(&ttid).unwrap();
|
||||
shut_down_task(ttid, &mut entry).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Start known tasks, if needed and possible.
|
||||
// For each timeline needing offloading, check if this safekeeper
|
||||
// should do the job and start/stop the task accordingly.
|
||||
_ = ticker.tick() => {
|
||||
for (ttid, entry) in tasks.iter_mut().filter(|(_, entry)| entry.handle.is_none()) {
|
||||
consider_start_task(&conf, *ttid, entry);
|
||||
for (ttid, entry) in tasks.iter_mut() {
|
||||
update_task(&conf, *ttid, entry).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -190,17 +236,13 @@ struct WalBackupTask {
|
||||
timeline_dir: PathBuf,
|
||||
wal_seg_size: usize,
|
||||
commit_lsn_watch_rx: watch::Receiver<Lsn>,
|
||||
leader: Option<ElectionLeader>,
|
||||
election: Election,
|
||||
}
|
||||
|
||||
/// Offload single timeline. Called only after we checked that backup
|
||||
/// is required (wal_backup_attend) and possible (can_wal_backup).
|
||||
/// Offload single timeline.
|
||||
async fn backup_task_main(
|
||||
ttid: TenantTimelineId,
|
||||
timeline_dir: PathBuf,
|
||||
mut shutdown_rx: Receiver<()>,
|
||||
election: Election,
|
||||
) {
|
||||
info!("started");
|
||||
let res = GlobalTimelines::get(ttid);
|
||||
@@ -215,8 +257,6 @@ async fn backup_task_main(
|
||||
commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
|
||||
timeline: tli,
|
||||
timeline_dir,
|
||||
leader: None,
|
||||
election,
|
||||
};
|
||||
|
||||
// task is spinned up only when wal_seg_size already initialized
|
||||
@@ -229,9 +269,6 @@ async fn backup_task_main(
|
||||
canceled = true;
|
||||
}
|
||||
}
|
||||
if let Some(l) = wb.leader {
|
||||
l.give_up().await;
|
||||
}
|
||||
info!("task {}", if canceled { "canceled" } else { "terminated" });
|
||||
}
|
||||
|
||||
@@ -239,106 +276,71 @@ impl WalBackupTask {
|
||||
async fn run(&mut self) {
|
||||
let mut backup_lsn = Lsn(0);
|
||||
|
||||
// election loop
|
||||
let mut retry_attempt = 0u32;
|
||||
// offload loop
|
||||
loop {
|
||||
let mut retry_attempt = 0u32;
|
||||
if retry_attempt == 0 {
|
||||
// wait for new WAL to arrive
|
||||
if let Err(e) = self.commit_lsn_watch_rx.changed().await {
|
||||
// should never happen, as we hold Arc to timeline.
|
||||
error!("commit_lsn watch shut down: {:?}", e);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
// or just sleep if we errored previously
|
||||
let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS;
|
||||
if let Some(backoff_delay) = UPLOAD_FAILURE_RETRY_MIN_MS.checked_shl(retry_attempt)
|
||||
{
|
||||
retry_delay = min(retry_delay, backoff_delay);
|
||||
}
|
||||
sleep(Duration::from_millis(retry_delay)).await;
|
||||
}
|
||||
|
||||
info!("acquiring leadership");
|
||||
if let Err(e) = broker::get_leader(&self.election, &mut self.leader).await {
|
||||
error!("error during leader election {:?}", e);
|
||||
sleep(Duration::from_millis(BROKER_CONNECTION_RETRY_DELAY_MS)).await;
|
||||
let commit_lsn = *self.commit_lsn_watch_rx.borrow();
|
||||
|
||||
// Note that backup_lsn can be higher than commit_lsn if we
|
||||
// don't have much local WAL and others already uploaded
|
||||
// segments we don't even have.
|
||||
if backup_lsn.segment_number(self.wal_seg_size)
|
||||
>= commit_lsn.segment_number(self.wal_seg_size)
|
||||
{
|
||||
retry_attempt = 0;
|
||||
continue; /* nothing to do, common case as we wake up on every commit_lsn bump */
|
||||
}
|
||||
// Perhaps peers advanced the position, check shmem value.
|
||||
backup_lsn = self.timeline.get_wal_backup_lsn();
|
||||
if backup_lsn.segment_number(self.wal_seg_size)
|
||||
>= commit_lsn.segment_number(self.wal_seg_size)
|
||||
{
|
||||
retry_attempt = 0;
|
||||
continue;
|
||||
}
|
||||
info!("acquired leadership");
|
||||
|
||||
// offload loop
|
||||
loop {
|
||||
if retry_attempt == 0 {
|
||||
// wait for new WAL to arrive
|
||||
if let Err(e) = self.commit_lsn_watch_rx.changed().await {
|
||||
// should never happen, as we hold Arc to timeline.
|
||||
error!("commit_lsn watch shut down: {:?}", e);
|
||||
match backup_lsn_range(
|
||||
backup_lsn,
|
||||
commit_lsn,
|
||||
self.wal_seg_size,
|
||||
&self.timeline_dir,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(backup_lsn_result) => {
|
||||
backup_lsn = backup_lsn_result;
|
||||
let res = self.timeline.set_wal_backup_lsn(backup_lsn_result);
|
||||
if let Err(e) = res {
|
||||
error!("failed to set wal_backup_lsn: {}", e);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
// or just sleep if we errored previously
|
||||
let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS;
|
||||
if let Some(backoff_delay) =
|
||||
UPLOAD_FAILURE_RETRY_MIN_MS.checked_shl(retry_attempt)
|
||||
{
|
||||
retry_delay = min(retry_delay, backoff_delay);
|
||||
}
|
||||
sleep(Duration::from_millis(retry_delay)).await;
|
||||
retry_attempt = 0;
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"failed while offloading range {}-{}: {:?}",
|
||||
backup_lsn, commit_lsn, e
|
||||
);
|
||||
|
||||
let commit_lsn = *self.commit_lsn_watch_rx.borrow();
|
||||
|
||||
// Note that backup_lsn can be higher than commit_lsn if we
|
||||
// don't have much local WAL and others already uploaded
|
||||
// segments we don't even have.
|
||||
if backup_lsn.segment_number(self.wal_seg_size)
|
||||
>= commit_lsn.segment_number(self.wal_seg_size)
|
||||
{
|
||||
continue; /* nothing to do, common case as we wake up on every commit_lsn bump */
|
||||
}
|
||||
// Perhaps peers advanced the position, check shmem value.
|
||||
backup_lsn = self.timeline.get_wal_backup_lsn();
|
||||
if backup_lsn.segment_number(self.wal_seg_size)
|
||||
>= commit_lsn.segment_number(self.wal_seg_size)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(l) = self.leader.as_mut() {
|
||||
// Optimization idea for later:
|
||||
// Avoid checking election leader every time by returning current lease grant expiration time
|
||||
// Re-check leadership only after expiration time,
|
||||
// such approach would reduce overhead on write-intensive workloads
|
||||
|
||||
match l
|
||||
.check_am_i(
|
||||
self.election.election_name.clone(),
|
||||
self.election.candidate_name.clone(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(leader) => {
|
||||
if !leader {
|
||||
info!("lost leadership");
|
||||
break;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("error validating leader, {:?}", e);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
match backup_lsn_range(
|
||||
backup_lsn,
|
||||
commit_lsn,
|
||||
self.wal_seg_size,
|
||||
&self.timeline_dir,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(backup_lsn_result) => {
|
||||
backup_lsn = backup_lsn_result;
|
||||
let res = self.timeline.set_wal_backup_lsn(backup_lsn_result);
|
||||
if let Err(e) = res {
|
||||
error!("backup error: {}", e);
|
||||
return;
|
||||
}
|
||||
retry_attempt = 0;
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"failed while offloading range {}-{}: {:?}",
|
||||
backup_lsn, commit_lsn, e
|
||||
);
|
||||
|
||||
retry_attempt = min(retry_attempt + 1, u32::MAX);
|
||||
if retry_attempt < u32::MAX {
|
||||
retry_attempt += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
51
scripts/docker-compose_test.sh
Executable file
51
scripts/docker-compose_test.sh
Executable file
@@ -0,0 +1,51 @@
|
||||
#!/bin/bash
|
||||
|
||||
# this is a shortcut script to avoid duplication in CI
|
||||
set -eux -o pipefail
|
||||
|
||||
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
COMPOSE_FILE=$SCRIPT_DIR/../docker-compose/docker-compose.yml
|
||||
|
||||
COMPUTE_CONTAINER_NAME=dockercompose_compute_1
|
||||
SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
|
||||
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
|
||||
|
||||
cleanup() {
|
||||
echo "show container information"
|
||||
docker ps
|
||||
docker-compose -f $COMPOSE_FILE logs
|
||||
echo "stop containers..."
|
||||
docker-compose -f $COMPOSE_FILE down
|
||||
}
|
||||
|
||||
echo "clean up containers if exists"
|
||||
cleanup
|
||||
|
||||
for pg_version in 14 15; do
|
||||
echo "start containers (pg_version=$pg_version)."
|
||||
PG_VERSION=$pg_version TAG=latest docker-compose -f $COMPOSE_FILE up --build -d
|
||||
|
||||
echo "wait until the compute is ready. timeout after 60s. "
|
||||
cnt=0
|
||||
while sleep 1; do
|
||||
# check timeout
|
||||
cnt=`expr $cnt + 1`
|
||||
if [ $cnt -gt 60 ]; then
|
||||
echo "timeout before the compute is ready."
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# check if the compute is ready
|
||||
set +o pipefail
|
||||
result=`docker-compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
|
||||
set -o pipefail
|
||||
if [ $result -eq 1 ]; then
|
||||
echo "OK. The compute is ready to connect."
|
||||
echo "execute simple queries."
|
||||
docker exec -it $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
|
||||
cleanup
|
||||
break
|
||||
fi
|
||||
done
|
||||
done
|
||||
@@ -4,18 +4,12 @@
|
||||
# Outline of steps:
|
||||
# 1. Get `(last_lsn, prev_lsn)` from old pageserver
|
||||
# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file
|
||||
# 3. This tar file might be missing relation files for empty relations, if the pageserver
|
||||
# is old enough (we didn't always store those). So to recreate them, we start a local
|
||||
# vanilla postgres on this basebackup and ask it what relations should exist, then touch
|
||||
# any missing files and re-pack the tar.
|
||||
# TODO This functionality is no longer needed, so we can delete it later if we don't
|
||||
# end up using the same utils for the pg 15 upgrade. Not sure.
|
||||
# 4. We import the patched basebackup into a new pageserver
|
||||
# 5. We export again via fullbackup, now from the new pageserver and compare the returned
|
||||
# 3. We import the basebackup into a new pageserver
|
||||
# 4. We export again via fullbackup, now from the new pageserver and compare the returned
|
||||
# tar file with the one we imported. This confirms that we imported everything that was
|
||||
# exported, but doesn't guarantee correctness (what if we didn't **export** everything
|
||||
# initially?)
|
||||
# 6. We wait for the new pageserver's remote_consistent_lsn to catch up
|
||||
# 5. We wait for the new pageserver's remote_consistent_lsn to catch up
|
||||
#
|
||||
# For more context on how to use this, see:
|
||||
# https://github.com/neondatabase/cloud/wiki/Storage-format-migration
|
||||
@@ -24,17 +18,13 @@ import argparse
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import uuid
|
||||
from contextlib import closing
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, cast
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import psycopg2
|
||||
import requests
|
||||
from psycopg2.extensions import connection as PgConnection
|
||||
from psycopg2.extensions import parse_dsn
|
||||
|
||||
###############################################
|
||||
### client-side utils copied from test fixtures
|
||||
@@ -135,105 +125,6 @@ class PgBin:
|
||||
)
|
||||
|
||||
|
||||
class PgProtocol:
|
||||
"""Reusable connection logic"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.default_options = kwargs
|
||||
|
||||
def conn_options(self, **kwargs):
|
||||
conn_options = self.default_options.copy()
|
||||
if "dsn" in kwargs:
|
||||
conn_options.update(parse_dsn(kwargs["dsn"]))
|
||||
conn_options.update(kwargs)
|
||||
|
||||
# Individual statement timeout in seconds. 2 minutes should be
|
||||
# enough for our tests, but if you need a longer one, you can
|
||||
# change it by calling "SET statement_timeout" after
|
||||
# connecting.
|
||||
conn_options["options"] = f"-cstatement_timeout=120s {conn_options.get('options', '')}"
|
||||
|
||||
return conn_options
|
||||
|
||||
# autocommit=True here by default because that's what we need most of the time
|
||||
def connect(self, autocommit=True, **kwargs) -> PgConnection:
|
||||
"""
|
||||
Connect to the node.
|
||||
Returns psycopg2's connection object.
|
||||
This method passes all extra params to connstr.
|
||||
"""
|
||||
conn = psycopg2.connect(**self.conn_options(**kwargs))
|
||||
|
||||
# WARNING: this setting affects *all* tests!
|
||||
conn.autocommit = autocommit
|
||||
return conn
|
||||
|
||||
def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]:
|
||||
"""
|
||||
Execute query against the node and return all rows.
|
||||
This method passes all extra params to connstr.
|
||||
"""
|
||||
return self.safe_psql_many([query], **kwargs)[0]
|
||||
|
||||
def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]:
|
||||
"""
|
||||
Execute queries against the node and return all rows.
|
||||
This method passes all extra params to connstr.
|
||||
"""
|
||||
result: List[List[Any]] = []
|
||||
with closing(self.connect(**kwargs)) as conn:
|
||||
with conn.cursor() as cur:
|
||||
for query in queries:
|
||||
print(f"Executing query: {query}")
|
||||
cur.execute(query)
|
||||
|
||||
if cur.description is None:
|
||||
result.append([]) # query didn't return data
|
||||
else:
|
||||
result.append(cast(List[Any], cur.fetchall()))
|
||||
return result
|
||||
|
||||
|
||||
class VanillaPostgres(PgProtocol):
|
||||
def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True):
|
||||
super().__init__(host="localhost", port=port, dbname="postgres")
|
||||
self.pgdatadir = pgdatadir
|
||||
self.pg_bin = pg_bin
|
||||
self.running = False
|
||||
if init:
|
||||
self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)])
|
||||
self.configure([f"port = {port}\n"])
|
||||
|
||||
def configure(self, options: List[str]):
|
||||
"""Append lines into postgresql.conf file."""
|
||||
assert not self.running
|
||||
with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file:
|
||||
conf_file.write("\n".join(options))
|
||||
|
||||
def start(self, log_path: Optional[str] = None):
|
||||
assert not self.running
|
||||
self.running = True
|
||||
|
||||
if log_path is None:
|
||||
log_path = os.path.join(self.pgdatadir, "pg.log")
|
||||
|
||||
self.pg_bin.run_capture(
|
||||
["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"]
|
||||
)
|
||||
|
||||
def stop(self):
|
||||
assert self.running
|
||||
self.running = False
|
||||
self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"])
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc, tb):
|
||||
if self.running:
|
||||
self.stop()
|
||||
|
||||
|
||||
class NeonPageserverApiException(Exception):
|
||||
pass
|
||||
|
||||
@@ -370,84 +261,6 @@ def pack_base(log_dir, restored_dir, output_tar):
|
||||
shutil.move(tmp_tar_path, output_tar)
|
||||
|
||||
|
||||
def reconstruct_paths(log_dir, pg_bin, base_tar):
|
||||
"""Reconstruct what relation files should exist in the datadir by querying postgres."""
|
||||
with tempfile.TemporaryDirectory() as restored_dir:
|
||||
# Unpack the base tar
|
||||
subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir])
|
||||
|
||||
# Start a vanilla postgres from the given datadir and query it to find
|
||||
# what relfiles should exist, but possibly don't.
|
||||
port = "55439" # Probably free
|
||||
with VanillaPostgres(restored_dir, pg_bin, port, init=False) as vanilla_pg:
|
||||
vanilla_pg.configure([f"port={port}"])
|
||||
vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log"))
|
||||
|
||||
# Create database based on template0 because we can't connect to template0
|
||||
query = "create database template0copy template template0"
|
||||
vanilla_pg.safe_psql(query, user="cloud_admin")
|
||||
vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin")
|
||||
|
||||
# Get all databases
|
||||
query = "select oid, datname from pg_database"
|
||||
oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin")
|
||||
template0_oid = [
|
||||
oid for (oid, database) in oid_dbname_pairs if database == "template0"
|
||||
][0]
|
||||
|
||||
# Get rel paths for each database
|
||||
for oid, database in oid_dbname_pairs:
|
||||
if database == "template0":
|
||||
# We can't connect to template0
|
||||
continue
|
||||
|
||||
query = "select relname, pg_relation_filepath(oid) from pg_class"
|
||||
result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database)
|
||||
for relname, filepath in result:
|
||||
if filepath is not None:
|
||||
|
||||
if database == "template0copy":
|
||||
# Add all template0copy paths to template0
|
||||
prefix = f"base/{oid}/"
|
||||
if filepath.startswith(prefix):
|
||||
suffix = filepath[len(prefix) :]
|
||||
yield f"base/{template0_oid}/{suffix}"
|
||||
elif filepath.startswith("global"):
|
||||
print(f"skipping {database} global file {filepath}")
|
||||
else:
|
||||
raise AssertionError
|
||||
else:
|
||||
yield filepath
|
||||
|
||||
|
||||
def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths):
|
||||
"""Add the appropriate empty files to a basebackup tar."""
|
||||
with tempfile.TemporaryDirectory() as restored_dir:
|
||||
# Unpack the base tar
|
||||
subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir])
|
||||
|
||||
# Touch files that don't exist
|
||||
for path in paths:
|
||||
absolute_path = os.path.join(restored_dir, path)
|
||||
exists = os.path.exists(absolute_path)
|
||||
if not exists:
|
||||
print(f"File {absolute_path} didn't exist. Creating..")
|
||||
Path(absolute_path).touch()
|
||||
|
||||
# Repackage
|
||||
pack_base(log_dir, restored_dir, output_tar)
|
||||
|
||||
|
||||
# HACK This is a workaround for exporting from old pageservers that
|
||||
# can't export empty relations. In this case we need to start
|
||||
# a vanilla postgres from the exported datadir, and query it
|
||||
# to see what empty relations are missing, and then create
|
||||
# those empty files before importing.
|
||||
def add_missing_rels(base_tar, output_tar, log_dir, pg_bin):
|
||||
reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar))
|
||||
touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths)
|
||||
|
||||
|
||||
def get_rlsn(pageserver_connstr, tenant_id, timeline_id):
|
||||
conn = psycopg2.connect(pageserver_connstr)
|
||||
conn.autocommit = True
|
||||
@@ -516,7 +329,6 @@ def export_timeline(
|
||||
pg_version,
|
||||
):
|
||||
# Choose filenames
|
||||
incomplete_filename = tar_filename + ".incomplete"
|
||||
stderr_filename = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr")
|
||||
|
||||
# Construct export command
|
||||
@@ -525,18 +337,14 @@ def export_timeline(
|
||||
|
||||
# Run export command
|
||||
print(f"Running: {cmd}")
|
||||
with open(incomplete_filename, "w") as stdout_f:
|
||||
with open(tar_filename, "w") as stdout_f:
|
||||
with open(stderr_filename, "w") as stderr_f:
|
||||
print(f"(capturing output to {incomplete_filename})")
|
||||
print(f"(capturing output to {tar_filename})")
|
||||
pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version)
|
||||
subprocess.run(
|
||||
cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True
|
||||
)
|
||||
|
||||
# Add missing rels
|
||||
pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version)
|
||||
add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin)
|
||||
|
||||
# Log more info
|
||||
file_size = os.path.getsize(tar_filename)
|
||||
print(f"Done export: {tar_filename}, size {file_size}")
|
||||
@@ -633,6 +441,13 @@ def main(args: argparse.Namespace):
|
||||
raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}")
|
||||
|
||||
|
||||
def non_zero_tcp_port(arg: Any):
|
||||
port = int(arg)
|
||||
if port < 1 or port > 65535:
|
||||
raise argparse.ArgumentTypeError(f"invalid tcp port: {arg}")
|
||||
return port
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
@@ -728,5 +543,13 @@ if __name__ == "__main__":
|
||||
default=False,
|
||||
help="directory where temporary tar files are stored",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tmp-pg-port",
|
||||
dest="tmp_pg_port",
|
||||
required=False,
|
||||
default=55439,
|
||||
type=non_zero_tcp_port,
|
||||
help="localhost port to use for temporary postgres instance",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@@ -6,6 +6,6 @@ set -euox pipefail
|
||||
echo 'Reformatting Rust code'
|
||||
cargo fmt
|
||||
echo 'Reformatting Python code'
|
||||
poetry run isort test_runner
|
||||
poetry run flake8 test_runner
|
||||
poetry run black test_runner
|
||||
poetry run isort test_runner scripts
|
||||
poetry run flake8 test_runner scripts
|
||||
poetry run black test_runner scripts
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user