mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-01 04:20:39 +00:00
Compare commits
1 Commits
neon-broke
...
nikitakaly
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4bd2dc4941 |
@@ -5,7 +5,6 @@
|
||||
!Cargo.lock
|
||||
!Makefile
|
||||
|
||||
!broker/
|
||||
!.cargo/
|
||||
!.config/
|
||||
!control_plane/
|
||||
|
||||
41
.github/actions/run-python-test-set/action.yml
vendored
41
.github/actions/run-python-test-set/action.yml
vendored
@@ -55,22 +55,6 @@ runs:
|
||||
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
- name: Download Neon binaries for the previous release
|
||||
if: inputs.build_type != 'remote'
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
|
||||
path: /tmp/neon-previous
|
||||
prefix: latest
|
||||
|
||||
- name: Download compatibility snapshot for Postgres 14
|
||||
if: inputs.build_type != 'remote'
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: compatibility-snapshot-${{ inputs.build_type }}-pg14
|
||||
path: /tmp/compatibility_snapshot_pg14
|
||||
prefix: latest
|
||||
|
||||
- name: Checkout
|
||||
if: inputs.needs_postgres_source == 'true'
|
||||
uses: actions/checkout@v3
|
||||
@@ -89,18 +73,22 @@ runs:
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Download compatibility snapshot for Postgres 14
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: compatibility-snapshot-${{ inputs.build_type }}-pg14
|
||||
path: /tmp/compatibility_snapshot_pg14
|
||||
prefix: latest
|
||||
|
||||
- name: Run pytest
|
||||
env:
|
||||
NEON_BIN: /tmp/neon/bin
|
||||
COMPATIBILITY_NEON_BIN: /tmp/neon-previous/bin
|
||||
COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: ${{ inputs.build_type }}
|
||||
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
|
||||
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
|
||||
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
|
||||
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
|
||||
ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes')
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# PLATFORM will be embedded in the perf test report
|
||||
@@ -123,12 +111,7 @@ runs:
|
||||
exit 1
|
||||
fi
|
||||
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
|
||||
# -n4 uses four processes to run tests via pytest-xdist
|
||||
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
|
||||
|
||||
# --dist=loadgroup points tests marked with @pytest.mark.xdist_group
|
||||
# to the same worker to make @pytest.mark.order work with xdist
|
||||
EXTRA_PARAMS="--dist=loadgroup $EXTRA_PARAMS"
|
||||
fi
|
||||
|
||||
if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then
|
||||
@@ -163,9 +146,9 @@ runs:
|
||||
# --verbose prints name of each test (helpful when there are
|
||||
# multiple tests in one file)
|
||||
# -rA prints summary in the end
|
||||
# -n4 uses four processes to run tests via pytest-xdist
|
||||
# -s is not used to prevent pytest from capturing output, because tests are running
|
||||
# in parallel and logs are mixed between different tests
|
||||
#
|
||||
mkdir -p $TEST_OUTPUT/allure/results
|
||||
"${cov_prefix[@]}" ./scripts/pytest \
|
||||
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||
@@ -185,12 +168,12 @@ runs:
|
||||
uses: ./.github/actions/upload
|
||||
with:
|
||||
name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
|
||||
# The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
|
||||
path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/
|
||||
# The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
|
||||
path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/
|
||||
prefix: latest
|
||||
|
||||
- name: Create Allure report
|
||||
if: success() || failure()
|
||||
if: always()
|
||||
uses: ./.github/actions/allure-report
|
||||
with:
|
||||
action: store
|
||||
|
||||
1
.github/ansible/get_binaries.sh
vendored
1
.github/ansible/get_binaries.sh
vendored
@@ -25,7 +25,6 @@ mkdir neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/pageserver_binutils neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/neon_broker neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/
|
||||
docker cp ${ID}:/usr/local/v15/bin/ neon_install/v15/bin/
|
||||
|
||||
35
.github/ansible/prod.ap-southeast-1.hosts.yaml
vendored
35
.github/ansible/prod.ap-southeast-1.hosts.yaml
vendored
@@ -1,35 +0,0 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-ap-southeast-1
|
||||
bucket_region: ap-southeast-1
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: ap-southeast-1
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1
|
||||
console_region_id: aws-ap-southeast-1
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-064de8ea28bdb495b
|
||||
pageserver-1.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-0b180defcaeeb6b93
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-0d6f1dc5161eef894
|
||||
safekeeper-1.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-0e338adda8eb2d19f
|
||||
safekeeper-2.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-04fb63634e4679eb9
|
||||
35
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
35
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
@@ -1,35 +0,0 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-eu-central-1
|
||||
bucket_region: eu-central-1
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: eu-central-1
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1
|
||||
console_region_id: aws-eu-central-1
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-0cd8d316ecbb715be
|
||||
pageserver-1.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-090044ed3d383fef0
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-0b238612d2318a050
|
||||
safekeeper-1.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-07b9c45e5c2637cd4
|
||||
safekeeper-2.eu-central-1.aws.neon.tech:
|
||||
ansible_host: i-020257302c3c93d88
|
||||
36
.github/ansible/prod.us-east-2.hosts.yaml
vendored
36
.github/ansible/prod.us-east-2.hosts.yaml
vendored
@@ -1,36 +0,0 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-us-east-2
|
||||
bucket_region: us-east-2
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: us-east-2
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2
|
||||
console_region_id: aws-us-east-2
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-062227ba7f119eb8c
|
||||
pageserver-1.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-0b3ec0afab5968938
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-0e94224750c57d346
|
||||
safekeeper-1.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-06d113fb73bfddeb0
|
||||
safekeeper-2.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-09f66c8e04afff2e8
|
||||
|
||||
1
.github/ansible/ssm_config
vendored
1
.github/ansible/ssm_config
vendored
@@ -1,2 +1,3 @@
|
||||
ansible_connection: aws_ssm
|
||||
ansible_aws_ssm_bucket_name: neon-dev-bucket
|
||||
ansible_python_interpreter: /usr/bin/python3
|
||||
|
||||
1
.github/ansible/staging.us-east-2.hosts.yaml
vendored
1
.github/ansible/staging.us-east-2.hosts.yaml
vendored
@@ -14,7 +14,6 @@ storage:
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: us-east-2
|
||||
ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
|
||||
console_region_id: aws-us-east-2
|
||||
|
||||
children:
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.ap-southeast-1.aws.neon.tech"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: ap-southeast-1
|
||||
zenith_region_slug: ap-southeast-1
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
# serviceMonitor:
|
||||
# enabled: true
|
||||
# selector:
|
||||
# release: kube-prometheus-stack
|
||||
@@ -1,31 +0,0 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.eu-central-1.aws.neon.tech"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: eu-central-1
|
||||
zenith_region_slug: eu-central-1
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
# serviceMonitor:
|
||||
# enabled: true
|
||||
# selector:
|
||||
# release: kube-prometheus-stack
|
||||
@@ -1,31 +0,0 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.us-east-2.aws.neon.tech"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
# serviceMonitor:
|
||||
# enabled: true
|
||||
# selector:
|
||||
# release: kube-prometheus-stack
|
||||
2
.github/workflows/benchmarking.yml
vendored
2
.github/workflows/benchmarking.yml
vendored
@@ -265,7 +265,7 @@ jobs:
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
- name: Create Allure report
|
||||
if: success() || failure()
|
||||
if: always()
|
||||
uses: ./.github/actions/allure-report
|
||||
with:
|
||||
action: generate
|
||||
|
||||
242
.github/workflows/build_and_test.yml
vendored
242
.github/workflows/build_and_test.yml
vendored
@@ -127,8 +127,8 @@ jobs:
|
||||
target/
|
||||
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
|
||||
key: |
|
||||
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
|
||||
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-
|
||||
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
|
||||
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
@@ -268,6 +268,32 @@ jobs:
|
||||
if: matrix.build_type == 'debug'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
upload-latest-artifacts:
|
||||
runs-on: dev
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
needs: [ regress-tests ]
|
||||
if: github.ref_name == 'main'
|
||||
steps:
|
||||
- name: Copy Neon artifact to the latest directory
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
BUCKET: neon-github-public-dev
|
||||
PREFIX: artifacts/${{ github.run_id }}
|
||||
run: |
|
||||
for build_type in debug release; do
|
||||
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
|
||||
|
||||
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
|
||||
if [ -z "${S3_KEY}" ]; then
|
||||
echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME}
|
||||
done
|
||||
|
||||
benchmarks:
|
||||
runs-on: dev
|
||||
container:
|
||||
@@ -305,7 +331,7 @@ jobs:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
needs: [ regress-tests, benchmarks ]
|
||||
if: success() || failure()
|
||||
if: always()
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -363,7 +389,7 @@ jobs:
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git/
|
||||
target/
|
||||
key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
|
||||
key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
|
||||
|
||||
- name: Get Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -419,15 +445,11 @@ jobs:
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
options: --init
|
||||
needs: [ push-docker-hub, tag ]
|
||||
needs: [ build-neon ]
|
||||
steps:
|
||||
- name: Set PR's status to pending and request a remote CI test
|
||||
run: |
|
||||
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
|
||||
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
|
||||
# to place a job run status update later.
|
||||
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
|
||||
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
|
||||
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
|
||||
|
||||
REMOTE_REPO="${{ github.repository_owner }}/cloud"
|
||||
@@ -453,9 +475,7 @@ jobs:
|
||||
\"inputs\": {
|
||||
\"ci_job_name\": \"neon-cloud-e2e\",
|
||||
\"commit_hash\": \"$COMMIT_SHA\",
|
||||
\"remote_repo\": \"${{ github.repository }}\",
|
||||
\"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
|
||||
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
|
||||
\"remote_repo\": \"${{ github.repository }}\"
|
||||
}
|
||||
}"
|
||||
|
||||
@@ -492,6 +512,26 @@ jobs:
|
||||
- name: Kaniko build compute tools
|
||||
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
compute-node-image:
|
||||
runs-on: dev
|
||||
container: gcr.io/kaniko-project/executor:v1.9.0-debug
|
||||
needs: [ tag ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1 # v3 won't work with kaniko
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Configure ECR login
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
# compute-node uses postgres 14, which is default now
|
||||
# cloud repo depends on this image name, thus duplicating it
|
||||
# remove compute-node when cloud repo is updated
|
||||
- name: Kaniko build compute node with extensions v14 (compatibility)
|
||||
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
compute-node-image-v14:
|
||||
runs-on: dev
|
||||
container: gcr.io/kaniko-project/executor:v1.9.0-debug
|
||||
@@ -509,6 +549,7 @@ jobs:
|
||||
- name: Kaniko build compute node with extensions v14
|
||||
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
|
||||
compute-node-image-v15:
|
||||
runs-on: dev
|
||||
container: gcr.io/kaniko-project/executor:v1.9.0-debug
|
||||
@@ -526,58 +567,18 @@ jobs:
|
||||
- name: Kaniko build compute node with extensions v15
|
||||
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
test-images:
|
||||
needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
|
||||
runs-on: dev
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
# `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
|
||||
# Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
|
||||
# Regular pageserver version string looks like
|
||||
# Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: []
|
||||
# Bad versions might loop like:
|
||||
# Neon page server git-env:local failpoints: true, features: ["testing"]
|
||||
# Ensure that we don't have bad versions.
|
||||
- name: Verify image versions
|
||||
shell: bash # ensure no set -e for better error messages
|
||||
run: |
|
||||
pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
|
||||
|
||||
echo "Pageserver version string: $pageserver_version"
|
||||
|
||||
if ! echo "$pageserver_version" | grep -qv 'git-env:local' ; then
|
||||
echo "Pageserver version should not be the default Dockerfile one"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! echo "$pageserver_version" | grep -qv '"testing"' ; then
|
||||
echo "Pageserver version should have no testing feature enabled"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Verify docker-compose example
|
||||
run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
|
||||
|
||||
- name: Print logs and clean up
|
||||
if: always()
|
||||
run: |
|
||||
docker compose -f ./docker-compose/docker-compose.yml logs || 0
|
||||
docker compose -f ./docker-compose/docker-compose.yml down
|
||||
|
||||
promote-images:
|
||||
runs-on: dev
|
||||
needs: [ tag, test-images ]
|
||||
needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
|
||||
if: github.event_name != 'workflow_dispatch'
|
||||
container: amazon/aws-cli
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
name: [ neon, compute-node-v14, compute-node-v15, compute-tools ]
|
||||
# compute-node uses postgres 14, which is default now
|
||||
# cloud repo depends on this image name, thus duplicating it
|
||||
# remove compute-node when cloud repo is updated
|
||||
name: [ neon, compute-node, compute-node-v14, compute-node-v15, compute-tools ]
|
||||
|
||||
steps:
|
||||
- name: Promote image to latest
|
||||
@@ -607,6 +608,9 @@ jobs:
|
||||
- name: Pull compute tools image from ECR
|
||||
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools
|
||||
|
||||
- name: Pull compute node image from ECR
|
||||
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} compute-node
|
||||
|
||||
- name: Pull compute node v14 image from ECR
|
||||
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14
|
||||
|
||||
@@ -621,10 +625,11 @@ jobs:
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
run: |
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest
|
||||
|
||||
- name: Configure Docker Hub login
|
||||
run: |
|
||||
@@ -638,6 +643,9 @@ jobs:
|
||||
- name: Push compute tools image to Docker Hub
|
||||
run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Push compute node image to Docker Hub
|
||||
run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Push compute node v14 image to Docker Hub
|
||||
run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
@@ -654,6 +662,7 @@ jobs:
|
||||
run: |
|
||||
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
|
||||
@@ -747,9 +756,9 @@ jobs:
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
target_region: [ us-east-2 ]
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -772,47 +781,7 @@ jobs:
|
||||
fi
|
||||
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-prod-new:
|
||||
runs-on: prod
|
||||
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
|
||||
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||
if: |
|
||||
(github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
target_region: [ us-east-2, eu-central-1, ap-southeast-1 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Redeploy
|
||||
run: |
|
||||
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
cd "$(pwd)/.github/ansible"
|
||||
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
./get_binaries.sh
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
RELEASE=true ./get_binaries.sh
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}}
|
||||
ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-proxy:
|
||||
@@ -868,11 +837,6 @@ jobs:
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- target_region: us-east-2
|
||||
target_cluster: dev-us-east-2-beta
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -883,51 +847,14 @@ jobs:
|
||||
- name: Configure environment
|
||||
run: |
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
|
||||
aws --region us-east-2 eks update-kubeconfig --name dev-us-east-2-beta --role-arn arn:aws:iam::369495373322:role/github-runner
|
||||
|
||||
- name: Re-deploy proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
|
||||
deploy-proxy-prod-new:
|
||||
runs-on: prod
|
||||
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||
if: |
|
||||
(github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- target_region: us-east-2
|
||||
target_cluster: prod-us-east-2-delta
|
||||
- target_region: eu-central-1
|
||||
target_cluster: prod-eu-central-1-gamma
|
||||
- target_region: ap-southeast-1
|
||||
target_cluster: prod-ap-southeast-1-epsilon
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Configure environment
|
||||
run: |
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
|
||||
|
||||
- name: Re-deploy proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
|
||||
promote-compatibility-data:
|
||||
promote-compatibility-test-snapshot:
|
||||
runs-on: dev
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
@@ -941,24 +868,9 @@ jobs:
|
||||
BUCKET: neon-github-public-dev
|
||||
PREFIX: artifacts/latest
|
||||
run: |
|
||||
# Update compatibility snapshot for the release
|
||||
for build_type in debug release; do
|
||||
OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
|
||||
NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst
|
||||
|
||||
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
|
||||
done
|
||||
|
||||
# Update Neon artifact for the release (reuse already uploaded artifact)
|
||||
for build_type in debug release; do
|
||||
OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
|
||||
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
|
||||
|
||||
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
|
||||
if [ -z "${S3_KEY}" ]; then
|
||||
echo 2>&1 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
|
||||
done
|
||||
|
||||
6
.github/workflows/codestyle.yml
vendored
6
.github/workflows/codestyle.yml
vendored
@@ -48,11 +48,11 @@ jobs:
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev protobuf-compiler
|
||||
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev
|
||||
|
||||
- name: Install macOS postgres dependencies
|
||||
if: matrix.os == 'macos-latest'
|
||||
run: brew install flex bison openssl protobuf
|
||||
run: brew install flex bison openssl
|
||||
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
@@ -106,7 +106,7 @@ jobs:
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git
|
||||
target
|
||||
key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
|
||||
key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
|
||||
|
||||
- name: Run cargo clippy
|
||||
run: ./run_clippy.sh
|
||||
|
||||
2
.gitmodules
vendored
2
.gitmodules
vendored
@@ -1,7 +1,7 @@
|
||||
[submodule "vendor/postgres-v14"]
|
||||
path = vendor/postgres-v14
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
branch = REL_14_STABLE_neon
|
||||
branch = main
|
||||
[submodule "vendor/postgres-v15"]
|
||||
path = vendor/postgres-v15
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
|
||||
10
CODEOWNERS
10
CODEOWNERS
@@ -1,10 +0,0 @@
|
||||
/compute_tools/ @neondatabase/control-plane
|
||||
/control_plane/ @neondatabase/compute @neondatabase/storage
|
||||
/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
|
||||
/libs/postgres_ffi/ @neondatabase/compute
|
||||
/libs/remote_storage/ @neondatabase/storage
|
||||
/libs/safekeeper_api/ @neondatabase/safekeepers
|
||||
/pageserver/ @neondatabase/compute @neondatabase/storage
|
||||
/pgxn/ @neondatabase/compute
|
||||
/proxy/ @neondatabase/control-plane
|
||||
/safekeeper/ @neondatabase/safekeepers
|
||||
304
Cargo.lock
generated
304
Cargo.lock
generated
@@ -317,6 +317,12 @@ dependencies = [
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "boxfnonce"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426"
|
||||
|
||||
[[package]]
|
||||
name = "bstr"
|
||||
version = "1.0.1"
|
||||
@@ -457,26 +463,11 @@ checksum = "6bf8832993da70a4c6d13c581f4463c2bdda27b9bf1c5498dc4365543abe6d6f"
|
||||
dependencies = [
|
||||
"atty",
|
||||
"bitflags",
|
||||
"clap_derive",
|
||||
"clap_lex 0.3.0",
|
||||
"once_cell",
|
||||
"strsim",
|
||||
"termcolor",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_derive"
|
||||
version = "4.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c42f169caba89a7d512b5418b09864543eeb4d497416c917d7137863bd2076ad"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro-error",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_lex"
|
||||
version = "0.2.4"
|
||||
@@ -569,42 +560,6 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "console-api"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e57ff02e8ad8e06ab9731d5dc72dc23bef9200778eae1a89d555d8c42e5d4a86"
|
||||
dependencies = [
|
||||
"prost 0.11.2",
|
||||
"prost-types 0.11.2",
|
||||
"tonic 0.8.2",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "console-subscriber"
|
||||
version = "0.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22a3a81dfaf6b66bce5d159eddae701e3a002f194d378cbf7be5f053c281d9be"
|
||||
dependencies = [
|
||||
"console-api",
|
||||
"crossbeam-channel",
|
||||
"crossbeam-utils",
|
||||
"futures",
|
||||
"hdrhistogram",
|
||||
"humantime",
|
||||
"prost-types 0.11.2",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thread_local",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tonic 0.8.2",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const_format"
|
||||
version = "0.2.30"
|
||||
@@ -645,7 +600,6 @@ dependencies = [
|
||||
"tar",
|
||||
"thiserror",
|
||||
"toml",
|
||||
"url",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -895,6 +849,16 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "daemonize"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70c24513e34f53b640819f0ac9f705b673fcf4006d7aab8778bee72ebfc89815"
|
||||
dependencies = [
|
||||
"boxfnonce",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling"
|
||||
version = "0.14.1"
|
||||
@@ -1056,11 +1020,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fb8664f6ea68aba5503d42dd1be786b0f1bd9b7972e7f40208c83ef74db91bf"
|
||||
dependencies = [
|
||||
"http",
|
||||
"prost 0.10.4",
|
||||
"prost",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tonic 0.7.2",
|
||||
"tonic-build 0.7.2",
|
||||
"tonic",
|
||||
"tonic-build",
|
||||
"tower",
|
||||
"tower-service",
|
||||
]
|
||||
@@ -1138,16 +1102,6 @@ version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"
|
||||
|
||||
[[package]]
|
||||
name = "flate2"
|
||||
version = "1.0.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6"
|
||||
dependencies = [
|
||||
"crc32fast",
|
||||
"miniz_oxide",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
@@ -1390,19 +1344,6 @@ dependencies = [
|
||||
"ahash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hdrhistogram"
|
||||
version = "7.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f19b9f54f7c7f55e31401bb647626ce0cf0f67b0004982ce815b3ee72a02aa8"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"byteorder",
|
||||
"flate2",
|
||||
"nom",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heapless"
|
||||
version = "0.7.16"
|
||||
@@ -1964,32 +1905,6 @@ version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "546c37ac5d9e56f55e73b677106873d9d9f5190605e41a856503623648488cae"
|
||||
|
||||
[[package]]
|
||||
name = "neon_broker"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"async-stream",
|
||||
"bytes",
|
||||
"clap 4.0.15",
|
||||
"console-subscriber",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"git-version",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"prost 0.11.2",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tonic 0.8.2",
|
||||
"tonic-build 0.8.2",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.23.1"
|
||||
@@ -2225,6 +2140,7 @@ dependencies = [
|
||||
"crc32c",
|
||||
"criterion",
|
||||
"crossbeam-utils",
|
||||
"daemonize",
|
||||
"etcd_broker",
|
||||
"fail",
|
||||
"futures",
|
||||
@@ -2245,7 +2161,6 @@ dependencies = [
|
||||
"postgres-types",
|
||||
"postgres_ffi",
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
"rand",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
@@ -2258,7 +2173,6 @@ dependencies = [
|
||||
"svg_fmt",
|
||||
"tar",
|
||||
"tempfile",
|
||||
"tenant_size_model",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
@@ -2276,7 +2190,6 @@ name = "pageserver_api"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"const_format",
|
||||
"postgres_ffi",
|
||||
@@ -2539,21 +2452,6 @@ version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"
|
||||
|
||||
[[package]]
|
||||
name = "pq_proto"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
"rand",
|
||||
"serde",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prettyplease"
|
||||
version = "0.1.21"
|
||||
@@ -2564,30 +2462,6 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
|
||||
dependencies = [
|
||||
"proc-macro-error-attr",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error-attr"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-hack"
|
||||
version = "0.5.19"
|
||||
@@ -2639,17 +2513,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "71adf41db68aa0daaefc69bb30bcd68ded9b9abaad5d1fbb6304c4fb390e083e"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"prost-derive 0.10.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a0841812012b2d4a6145fae9a6af1534873c32aa67fff26bd09f8fa42c83f95a"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"prost-derive 0.11.2",
|
||||
"prost-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2667,35 +2531,13 @@ dependencies = [
|
||||
"log",
|
||||
"multimap",
|
||||
"petgraph",
|
||||
"prost 0.10.4",
|
||||
"prost-types 0.10.1",
|
||||
"prost",
|
||||
"prost-types",
|
||||
"regex",
|
||||
"tempfile",
|
||||
"which",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-build"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d8b442418ea0822409d9e7d047cbf1e7e9e1760b172bf9982cf29d517c93511"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck",
|
||||
"itertools",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"multimap",
|
||||
"petgraph",
|
||||
"prettyplease",
|
||||
"prost 0.11.2",
|
||||
"prost-types 0.11.2",
|
||||
"regex",
|
||||
"syn",
|
||||
"tempfile",
|
||||
"which",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-derive"
|
||||
version = "0.10.1"
|
||||
@@ -2709,19 +2551,6 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-derive"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "164ae68b6587001ca506d3bf7f1000bfa248d0e1217b618108fba4ec1d0cc306"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"itertools",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-types"
|
||||
version = "0.10.1"
|
||||
@@ -2729,17 +2558,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2d0a014229361011dc8e69c8a1ec6c2e8d0f2af7c91e3ea3f5b2170298461e68"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"prost 0.10.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-types"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "747761bc3dc48f9a34553bf65605cf6cb6288ba219f3450b4275dbd81539551a"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"prost 0.11.2",
|
||||
"prost",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2765,7 +2584,6 @@ dependencies = [
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"pin-project-lite",
|
||||
"pq_proto",
|
||||
"rand",
|
||||
"rcgen",
|
||||
"reqwest",
|
||||
@@ -3269,6 +3087,7 @@ dependencies = [
|
||||
"clap 4.0.15",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
"daemonize",
|
||||
"etcd_broker",
|
||||
"fs2",
|
||||
"git-version",
|
||||
@@ -3276,13 +3095,11 @@ dependencies = [
|
||||
"humantime",
|
||||
"hyper",
|
||||
"metrics",
|
||||
"nix 0.25.0",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"safekeeper_api",
|
||||
@@ -3731,13 +3548,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tenant_size_model"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "termcolor"
|
||||
version = "1.1.3"
|
||||
@@ -3850,12 +3660,10 @@ dependencies = [
|
||||
"mio",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"pin-project-lite",
|
||||
"signal-hook-registry",
|
||||
"socket2",
|
||||
"tokio-macros",
|
||||
"tracing",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
@@ -4003,40 +3811,8 @@ dependencies = [
|
||||
"hyper-timeout",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"prost 0.10.4",
|
||||
"prost-derive 0.10.1",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tower",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
"tracing-futures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tonic"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "55b9af819e54b8f33d453655bef9b9acc171568fb49523078d0cc4e7484200ec"
|
||||
dependencies = [
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"axum",
|
||||
"base64",
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"h2",
|
||||
"http",
|
||||
"http-body",
|
||||
"hyper",
|
||||
"hyper-timeout",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"prost 0.11.2",
|
||||
"prost-derive 0.11.2",
|
||||
"prost",
|
||||
"prost-derive",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
@@ -4055,20 +3831,7 @@ checksum = "d9263bf4c9bfaae7317c1c2faf7f18491d2fe476f70c414b73bf5d445b00ffa1"
|
||||
dependencies = [
|
||||
"prettyplease",
|
||||
"proc-macro2",
|
||||
"prost-build 0.10.4",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tonic-build"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48c6fd7c2581e36d63388a9e04c350c21beb7a8b059580b2e93993c526899ddc"
|
||||
dependencies = [
|
||||
"prettyplease",
|
||||
"proc-macro2",
|
||||
"prost-build 0.11.2",
|
||||
"prost-build",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
@@ -4290,7 +4053,9 @@ dependencies = [
|
||||
"metrics",
|
||||
"nix 0.25.0",
|
||||
"once_cell",
|
||||
"pq_proto",
|
||||
"pin-project-lite",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"rand",
|
||||
"routerify",
|
||||
"rustls",
|
||||
@@ -4615,9 +4380,6 @@ dependencies = [
|
||||
"crossbeam-utils",
|
||||
"either",
|
||||
"fail",
|
||||
"futures-channel",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
"hashbrown",
|
||||
"indexmap",
|
||||
"libc",
|
||||
@@ -4627,12 +4389,10 @@ dependencies = [
|
||||
"num-bigint",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
"prost 0.10.4",
|
||||
"prost 0.11.2",
|
||||
"prost",
|
||||
"rand",
|
||||
"regex",
|
||||
"regex-syntax",
|
||||
"reqwest",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"stable_deref_trait",
|
||||
|
||||
@@ -11,7 +11,6 @@ cargo-features = ["named-profiles"]
|
||||
|
||||
[workspace]
|
||||
members = [
|
||||
"broker",
|
||||
"compute_tools",
|
||||
"control_plane",
|
||||
"pageserver",
|
||||
@@ -26,10 +25,6 @@ members = [
|
||||
# Besides, debug info should not affect the performance.
|
||||
debug = true
|
||||
|
||||
# disable debug symbols for all packages except this one to decrease binaries size
|
||||
[profile.release.package."*"]
|
||||
debug = false
|
||||
|
||||
[profile.release-line-debug]
|
||||
inherits = "release"
|
||||
debug = 1 # true = 2 = all symbols, 1 = line only
|
||||
|
||||
@@ -44,7 +44,7 @@ COPY . .
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
||||
RUN set -e \
|
||||
&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin neon_broker --bin proxy --locked --release \
|
||||
&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin proxy --locked --release \
|
||||
&& cachepot -s
|
||||
|
||||
# Build final image
|
||||
@@ -67,7 +67,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_broker /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||
|
||||
@@ -13,7 +13,7 @@ ARG TAG=pinned
|
||||
FROM debian:bullseye-slim AS build-deps
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -24,7 +24,7 @@ RUN apt update && \
|
||||
FROM build-deps AS pg-build
|
||||
COPY vendor/postgres-v14 postgres
|
||||
RUN cd postgres && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
|
||||
@@ -13,7 +13,7 @@ ARG TAG=pinned
|
||||
FROM debian:bullseye-slim AS build-deps
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -24,7 +24,7 @@ RUN apt update && \
|
||||
FROM build-deps AS pg-build
|
||||
COPY vendor/postgres-v15 postgres
|
||||
RUN cd postgres && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
|
||||
88
Dockerfile.compute-node.legacy
Normal file
88
Dockerfile.compute-node.legacy
Normal file
@@ -0,0 +1,88 @@
|
||||
#
|
||||
# Legacy version of the Dockerfile for the compute node.
|
||||
# Used by e2e CI. Building Dockerfile.compute-node will take
|
||||
# unreasonable ammount of time without v2 runners.
|
||||
#
|
||||
# TODO: remove once cloud repo CI is moved to v2 runners.
|
||||
#
|
||||
|
||||
|
||||
# Allow specifiyng different compute-tools tag and image repo, so we are
|
||||
# able to use different images
|
||||
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
ARG IMAGE=compute-tools
|
||||
ARG TAG=latest
|
||||
|
||||
#
|
||||
# Image with pre-built tools
|
||||
#
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps
|
||||
# Only to get ready compute_ctl binary as deppendency
|
||||
|
||||
#
|
||||
# Image with Postgres build deps
|
||||
#
|
||||
FROM debian:bullseye-slim AS build-deps
|
||||
|
||||
RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
|
||||
libcurl4-openssl-dev libossp-uuid-dev
|
||||
|
||||
#
|
||||
# Image with built Postgres
|
||||
#
|
||||
FROM build-deps AS pg-build
|
||||
|
||||
# Add user postgres
|
||||
RUN adduser postgres
|
||||
RUN mkdir /pg && chown postgres:postgres /pg
|
||||
|
||||
# Copy source files
|
||||
# version 14 is default for now
|
||||
COPY ./vendor/postgres-v14 /pg/
|
||||
COPY ./pgxn /pg/
|
||||
|
||||
# Build and install Postgres locally
|
||||
RUN mkdir /pg/compute_build && cd /pg/compute_build && \
|
||||
../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \
|
||||
# Install main binaries and contribs
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install
|
||||
|
||||
# Install neon contrib
|
||||
RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install
|
||||
|
||||
USER postgres
|
||||
WORKDIR /pg
|
||||
|
||||
#
|
||||
# Final compute node image to be exported
|
||||
#
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
# libreadline-dev is required to run psql
|
||||
RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev
|
||||
|
||||
# Add user postgres
|
||||
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
echo "postgres:test_console_pass" | chpasswd && \
|
||||
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||
chown -R postgres:postgres /var/db/postgres && \
|
||||
chmod 0750 /var/db/postgres/compute
|
||||
|
||||
# Copy ready Postgres binaries
|
||||
COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local
|
||||
|
||||
# Copy binaries from compute-tools
|
||||
COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
# XXX: temporary symlink for compatibility with old control-plane
|
||||
RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
|
||||
|
||||
# Add postgres shared objects to the search path
|
||||
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
|
||||
|
||||
USER postgres
|
||||
|
||||
ENTRYPOINT ["/usr/local/bin/compute_ctl"]
|
||||
10
Makefile
10
Makefile
@@ -151,11 +151,6 @@ neon-pg-ext-v14: postgres-v14
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
|
||||
+@echo "Compiling neon_walredo v14"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
|
||||
+@echo "Compiling neon_test_utils" v14
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
|
||||
@@ -168,11 +163,6 @@ neon-pg-ext-v15: postgres-v15
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
|
||||
+@echo "Compiling neon_walredo v15"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
|
||||
+@echo "Compiling neon_test_utils" v15
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \
|
||||
|
||||
@@ -35,12 +35,12 @@ Pageserver consists of:
|
||||
* On Ubuntu or Debian, this set of packages should be sufficient to build the code:
|
||||
```bash
|
||||
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
|
||||
libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client protobuf-compiler
|
||||
libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client
|
||||
```
|
||||
* On Fedora, these packages are needed:
|
||||
```bash
|
||||
dnf install flex bison readline-devel zlib-devel openssl-devel \
|
||||
libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib protobuf-compiler
|
||||
libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
@@ -223,7 +223,10 @@ Ensure your dependencies are installed as described [here](https://github.com/ne
|
||||
```sh
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
|
||||
# either:
|
||||
CARGO_BUILD_FLAGS="--features=testing" make
|
||||
# or:
|
||||
make debug
|
||||
|
||||
./scripts/pytest
|
||||
```
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
[package]
|
||||
name = "neon_broker"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[features]
|
||||
bench = []
|
||||
# for exploring with tokio-console. Note that tokio_unstable cfg must be enabled, i.e.
|
||||
# RUSTFLAGS="--cfg tokio_unstable" cargo build -r -p neon_broker --features console
|
||||
console = ["dep:console-subscriber", "tokio/full", "tokio/tracing"]
|
||||
|
||||
[[bin]]
|
||||
name = "neon_broker_bench"
|
||||
path = "src/bin/bench.rs"
|
||||
# build benchmarking binary only if explicitly requested with '--features bench'
|
||||
required-features = ["bench"]
|
||||
|
||||
[dependencies]
|
||||
async-stream = "0.3"
|
||||
bytes = "1.0"
|
||||
clap = { version = "4.0", features = ["derive"] }
|
||||
futures-core = "0.3"
|
||||
futures-util = "0.3"
|
||||
git-version = "0.3.5"
|
||||
humantime = "2.1.0"
|
||||
hyper = {version = "0.14.14", features = ["full"]}
|
||||
once_cell = "1.13.0"
|
||||
parking_lot = "0.12"
|
||||
prost = "0.11"
|
||||
tonic = "0.8"
|
||||
tokio = { version = "1.0", features = ["macros", "rt-multi-thread"] }
|
||||
tokio-stream = "0.1"
|
||||
tracing = "0.1.27"
|
||||
|
||||
console-subscriber = {version = "0.1.8", optional = true }
|
||||
|
||||
metrics = { path = "../libs/metrics" }
|
||||
utils = { path = "../libs/utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
[build-dependencies]
|
||||
tonic-build = "0.8"
|
||||
@@ -1,7 +0,0 @@
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Generate code to deterministic location to make finding it easier.
|
||||
tonic_build::configure()
|
||||
.out_dir("proto/") // put generated code to proto/
|
||||
.compile(&["proto/broker.proto"], &["proto/"])?;
|
||||
Ok(())
|
||||
}
|
||||
2
broker/proto/.gitignore
vendored
2
broker/proto/.gitignore
vendored
@@ -1,2 +0,0 @@
|
||||
# protobuf generated code
|
||||
neon_broker.rs
|
||||
@@ -1,44 +0,0 @@
|
||||
syntax = "proto3";
|
||||
|
||||
import "google/protobuf/empty.proto";
|
||||
|
||||
package neon_broker;
|
||||
|
||||
service NeonBroker {
|
||||
// Subscribe to safekeeper updates.
|
||||
rpc SubscribeSafekeeperInfo(SubscribeSafekeeperInfoRequest) returns (stream SafekeeperTimelineInfo) {};
|
||||
|
||||
// Publish safekeeper updates.
|
||||
rpc PublishSafekeeperInfo(stream SafekeeperTimelineInfo) returns (google.protobuf.Empty) {};
|
||||
}
|
||||
|
||||
message SubscribeSafekeeperInfoRequest {
|
||||
oneof subscription_key {
|
||||
google.protobuf.Empty all = 1; // subscribe to everything
|
||||
TenantTimelineId tenant_timeline_id = 2; // subscribe to specific timeline
|
||||
}
|
||||
}
|
||||
|
||||
message SafekeeperTimelineInfo {
|
||||
uint64 safekeeper_id = 1;
|
||||
TenantTimelineId tenant_timeline_id = 2;
|
||||
// Term of the last entry.
|
||||
uint64 last_log_term = 3;
|
||||
// LSN of the last record.
|
||||
uint64 flush_lsn = 4;
|
||||
// Up to which LSN safekeeper regards its WAL as committed.
|
||||
uint64 commit_lsn = 5;
|
||||
// LSN up to which safekeeper has backed WAL.
|
||||
uint64 backup_lsn = 6;
|
||||
// LSN of last checkpoint uploaded by pageserver.
|
||||
uint64 remote_consistent_lsn = 7;
|
||||
uint64 peer_horizon_lsn = 8;
|
||||
uint64 local_start_lsn = 9;
|
||||
// A connection string to use for WAL receiving.
|
||||
string safekeeper_connstr = 10;
|
||||
}
|
||||
|
||||
message TenantTimelineId {
|
||||
bytes tenant_id = 1;
|
||||
bytes timeline_id = 2;
|
||||
}
|
||||
@@ -1,174 +0,0 @@
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use clap::Parser;
|
||||
use neon_broker::neon_broker_proto::subscribe_safekeeper_info_request::SubscriptionKey;
|
||||
use neon_broker::neon_broker_proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use neon_broker::neon_broker_proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest};
|
||||
use neon_broker::NeonBrokerClientChannel;
|
||||
use neon_broker::DEFAULT_LISTEN_ADDR;
|
||||
use tokio::time::{self};
|
||||
|
||||
use tonic::Request;
|
||||
|
||||
const ABOUT: &str = r#"
|
||||
A simple benchmarking tool for neon_broker. Creates specified number of per
|
||||
timeline publishers and subscribers; each publisher continiously sends
|
||||
messages, subscribers read them. Each second the tool outputs number of
|
||||
messages summed across all subscribers and min number of messages
|
||||
recevied by single subscriber.
|
||||
|
||||
For example,
|
||||
cargo build -r -p neon_broker --features bench && target/release/neon_broker
|
||||
target/release/neon_broker_bench -s 1 -p 1
|
||||
"#;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[clap(author, version, about = ABOUT)]
|
||||
struct Args {
|
||||
/// Number of publishers
|
||||
#[clap(short = 'p', long, value_parser, default_value_t = 1)]
|
||||
num_pubs: u64,
|
||||
/// Number of subscribers
|
||||
#[clap(short = 's', long, value_parser, default_value_t = 1)]
|
||||
num_subs: u64,
|
||||
}
|
||||
|
||||
async fn progress_reporter(counters: Vec<Arc<AtomicU64>>) {
|
||||
let mut interval = time::interval(Duration::from_millis(1000));
|
||||
let mut c_old = counters.iter().map(|c| c.load(Ordering::Relaxed)).sum();
|
||||
let mut c_min_old = counters
|
||||
.iter()
|
||||
.map(|c| c.load(Ordering::Relaxed))
|
||||
.min()
|
||||
.unwrap_or(0);
|
||||
let mut started_at = None;
|
||||
let mut skipped: u64 = 0;
|
||||
loop {
|
||||
interval.tick().await;
|
||||
let c_new = counters.iter().map(|c| c.load(Ordering::Relaxed)).sum();
|
||||
let c_min_new = counters
|
||||
.iter()
|
||||
.map(|c| c.load(Ordering::Relaxed))
|
||||
.min()
|
||||
.unwrap_or(0);
|
||||
if c_new > 0 && started_at.is_none() {
|
||||
started_at = Some(Instant::now());
|
||||
skipped = c_new;
|
||||
}
|
||||
let avg_rps = started_at.map(|s| {
|
||||
let dur = s.elapsed();
|
||||
let dur_secs = dur.as_secs() as f64 + (dur.subsec_millis() as f64) / 1000.0;
|
||||
let avg_rps = (c_new - skipped) as f64 / dur_secs;
|
||||
(dur, avg_rps)
|
||||
});
|
||||
println!(
|
||||
"sum rps {}, min rps {} total {}, total min {}, duration, avg sum rps {:?}",
|
||||
c_new - c_old,
|
||||
c_min_new - c_min_old,
|
||||
c_new,
|
||||
c_min_new,
|
||||
avg_rps
|
||||
);
|
||||
c_old = c_new;
|
||||
c_min_old = c_min_new;
|
||||
}
|
||||
}
|
||||
|
||||
fn tli_from_u64(i: u64) -> Vec<u8> {
|
||||
let mut timeline_id = vec![0xFF; 8];
|
||||
timeline_id.extend_from_slice(&i.to_be_bytes());
|
||||
timeline_id
|
||||
}
|
||||
|
||||
async fn subscribe(client: Option<NeonBrokerClientChannel>, counter: Arc<AtomicU64>, i: u64) {
|
||||
let mut client = match client {
|
||||
Some(c) => c,
|
||||
None => NeonBrokerClientChannel::connect_lazy(format!("http://{}", DEFAULT_LISTEN_ADDR))
|
||||
.await
|
||||
.unwrap(),
|
||||
};
|
||||
|
||||
// let key = SubscriptionKey::All(());
|
||||
let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
|
||||
tenant_id: vec![0xFF; 16],
|
||||
timeline_id: tli_from_u64(i),
|
||||
});
|
||||
let request = SubscribeSafekeeperInfoRequest {
|
||||
subscription_key: Some(key),
|
||||
};
|
||||
let mut stream = client
|
||||
.subscribe_safekeeper_info(request)
|
||||
.await
|
||||
.unwrap()
|
||||
.into_inner();
|
||||
|
||||
while let Some(_feature) = stream.message().await.unwrap() {
|
||||
counter.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
async fn publish(client: Option<NeonBrokerClientChannel>, n_keys: u64) {
|
||||
let mut client = match client {
|
||||
Some(c) => c,
|
||||
None => NeonBrokerClientChannel::connect_lazy(format!("http://{}", DEFAULT_LISTEN_ADDR))
|
||||
.await
|
||||
.unwrap(),
|
||||
};
|
||||
let mut counter: u64 = 0;
|
||||
|
||||
// create stream producing new values
|
||||
let outbound = async_stream::stream! {
|
||||
loop {
|
||||
let info = SafekeeperTimelineInfo {
|
||||
safekeeper_id: 1,
|
||||
tenant_timeline_id: Some(ProtoTenantTimelineId {
|
||||
tenant_id: vec![0xFF; 16],
|
||||
timeline_id: tli_from_u64(counter % n_keys),
|
||||
}),
|
||||
last_log_term: 0,
|
||||
flush_lsn: counter,
|
||||
commit_lsn: 2,
|
||||
backup_lsn: 3,
|
||||
remote_consistent_lsn: 4,
|
||||
peer_horizon_lsn: 5,
|
||||
safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(),
|
||||
local_start_lsn: 0,
|
||||
};
|
||||
counter += 1;
|
||||
yield info;
|
||||
}
|
||||
};
|
||||
let response = client.publish_safekeeper_info(Request::new(outbound)).await;
|
||||
println!("pub response is {:?}", response);
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let args = Args::parse();
|
||||
|
||||
let mut counters = Vec::with_capacity(args.num_subs as usize);
|
||||
for _ in 0..args.num_subs {
|
||||
counters.push(Arc::new(AtomicU64::new(0)));
|
||||
}
|
||||
let h = tokio::spawn(progress_reporter(counters.clone()));
|
||||
|
||||
let c = NeonBrokerClientChannel::connect_lazy(format!("http://{}", DEFAULT_LISTEN_ADDR))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
for i in 0..args.num_subs {
|
||||
let c = Some(c.clone());
|
||||
// let c = None;
|
||||
tokio::spawn(subscribe(c, counters[i as usize].clone(), i));
|
||||
}
|
||||
for _i in 0..args.num_pubs {
|
||||
// let c = Some(c.clone());
|
||||
let c = None;
|
||||
tokio::spawn(publish(c, args.num_subs as u64));
|
||||
}
|
||||
|
||||
h.await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,510 +0,0 @@
|
||||
//! Simple pub-sub based on grpc (tonic) and Tokio mpsc for storage nodes
|
||||
//! messaging. The main design goal is to avoid central synchronization during
|
||||
//! normal flow, resorting to it only when pub/sub change happens. Each
|
||||
//! subscriber holds mpsc for messages it sits on; tx end is sent to existing
|
||||
//! publishers and saved in shared state for new ones. Publishers maintain
|
||||
//! locally set of subscribers they stream messages to.
|
||||
//!
|
||||
//! Subscriptions to 1) single timeline 2) everything are possible. We could add
|
||||
//! subscription to set of timelines to save grpc streams, but testing shows
|
||||
//! many individual streams is also ok.
|
||||
//!
|
||||
//! Message is dropped if subscriber can't consume it, not affecting other
|
||||
//! subscribers.
|
||||
//!
|
||||
//! Only safekeeper message is supported, but it is not hard to add something
|
||||
//! else with generics.
|
||||
use clap::{command, Parser};
|
||||
use futures_core::Stream;
|
||||
use futures_util::StreamExt;
|
||||
use hyper::header::CONTENT_TYPE;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, StatusCode};
|
||||
use parking_lot::RwLock;
|
||||
use std::collections::HashMap;
|
||||
use std::convert::Infallible;
|
||||
use std::net::SocketAddr;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::broadcast;
|
||||
use tokio::sync::broadcast::error::RecvError;
|
||||
use tonic::codegen::Service;
|
||||
use tonic::Code;
|
||||
use tonic::{Request, Response, Status};
|
||||
use tracing::*;
|
||||
|
||||
use metrics::{Encoder, TextEncoder};
|
||||
use neon_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE};
|
||||
use neon_broker::neon_broker_proto::neon_broker_server::{NeonBroker, NeonBrokerServer};
|
||||
use neon_broker::neon_broker_proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
|
||||
use neon_broker::neon_broker_proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest};
|
||||
use neon_broker::{parse_proto_ttid, EitherBody, DEFAULT_LISTEN_ADDR};
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::project_git_version;
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
const DEFAULT_CHAN_SIZE: usize = 256;
|
||||
const DEFAULT_HTTP2_KEEPALIVE_INTERVAL: &str = "5000ms";
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(version = GIT_VERSION, about = "Broker for neon storage nodes communication", long_about = None)]
|
||||
struct Args {
|
||||
/// Endpoint to listen on.
|
||||
#[arg(short, long, default_value = DEFAULT_LISTEN_ADDR)]
|
||||
listen_addr: SocketAddr,
|
||||
/// Size of the queue to the subscriber.
|
||||
#[arg(long, default_value_t = DEFAULT_CHAN_SIZE)]
|
||||
chan_size: usize,
|
||||
/// HTTP/2 keepalive interval.
|
||||
#[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HTTP2_KEEPALIVE_INTERVAL)]
|
||||
http2_keepalive_interval: Duration,
|
||||
/// Format for logging, either 'plain' or 'json'.
|
||||
#[arg(long, default_value = "plain")]
|
||||
log_format: String,
|
||||
}
|
||||
|
||||
type PubId = u64; // id of publisher for registering in maps
|
||||
type SubId = u64; // id of subscriber for registering in maps
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
enum SubscriptionKey {
|
||||
All,
|
||||
Timeline(TenantTimelineId),
|
||||
}
|
||||
|
||||
impl SubscriptionKey {
|
||||
// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors).
|
||||
pub fn from_proto_subscription_key(key: ProtoSubscriptionKey) -> Result<Self, Status> {
|
||||
match key {
|
||||
ProtoSubscriptionKey::All(_) => Ok(SubscriptionKey::All),
|
||||
ProtoSubscriptionKey::TenantTimelineId(proto_ttid) => {
|
||||
Ok(SubscriptionKey::Timeline(parse_proto_ttid(&proto_ttid)?))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Channel to timeline subscribers.
|
||||
struct ChanToTimelineSub {
|
||||
chan: broadcast::Sender<SafekeeperTimelineInfo>,
|
||||
// Tracked separately to know when delete the shmem entry. receiver_count()
|
||||
// is unhandy for that as unregistering and dropping the receiver side
|
||||
// happens at different moments.
|
||||
num_subscribers: u64,
|
||||
}
|
||||
|
||||
struct SharedState {
|
||||
next_pub_id: PubId,
|
||||
num_pubs: i64,
|
||||
next_sub_id: SubId,
|
||||
num_subs_to_timelines: i64,
|
||||
chans_to_timeline_subs: HashMap<TenantTimelineId, ChanToTimelineSub>,
|
||||
num_subs_to_all: i64,
|
||||
chan_to_all_subs: broadcast::Sender<SafekeeperTimelineInfo>,
|
||||
}
|
||||
|
||||
impl SharedState {
|
||||
pub fn new(chan_size: usize) -> Self {
|
||||
SharedState {
|
||||
next_pub_id: 0,
|
||||
num_pubs: 0,
|
||||
next_sub_id: 0,
|
||||
num_subs_to_timelines: 0,
|
||||
chans_to_timeline_subs: HashMap::new(),
|
||||
num_subs_to_all: 0,
|
||||
chan_to_all_subs: broadcast::channel(chan_size).0,
|
||||
}
|
||||
}
|
||||
|
||||
// Register new publisher.
|
||||
pub fn register_publisher(&mut self, registry: Registry) -> Publisher {
|
||||
let pub_id = self.next_pub_id;
|
||||
self.next_pub_id += 1;
|
||||
self.num_pubs += 1;
|
||||
NUM_PUBS.set(self.num_pubs);
|
||||
Publisher {
|
||||
id: pub_id,
|
||||
registry,
|
||||
}
|
||||
}
|
||||
|
||||
// Unregister publisher.
|
||||
pub fn unregister_publisher(&mut self) {
|
||||
self.num_pubs -= 1;
|
||||
NUM_PUBS.set(self.num_pubs);
|
||||
}
|
||||
|
||||
// Register new subscriber.
|
||||
pub fn register_subscriber(
|
||||
&mut self,
|
||||
sub_key: SubscriptionKey,
|
||||
registry: Registry,
|
||||
chan_size: usize,
|
||||
) -> Subscriber {
|
||||
let sub_id = self.next_sub_id;
|
||||
self.next_sub_id += 1;
|
||||
let sub_rx = match sub_key {
|
||||
SubscriptionKey::All => {
|
||||
self.num_subs_to_all += 1;
|
||||
NUM_SUBS_ALL.set(self.num_subs_to_all);
|
||||
self.chan_to_all_subs.subscribe()
|
||||
}
|
||||
SubscriptionKey::Timeline(ttid) => {
|
||||
self.num_subs_to_timelines += 1;
|
||||
NUM_SUBS_TIMELINE.set(self.num_subs_to_timelines);
|
||||
// Create new broadcast channel for this key, or subscriber to
|
||||
// the existing one.
|
||||
let chan_to_timeline_sub =
|
||||
self.chans_to_timeline_subs
|
||||
.entry(ttid)
|
||||
.or_insert(ChanToTimelineSub {
|
||||
chan: broadcast::channel(chan_size).0,
|
||||
num_subscribers: 0,
|
||||
});
|
||||
chan_to_timeline_sub.num_subscribers += 1;
|
||||
chan_to_timeline_sub.chan.subscribe()
|
||||
}
|
||||
};
|
||||
Subscriber {
|
||||
id: sub_id,
|
||||
key: sub_key,
|
||||
sub_rx,
|
||||
registry,
|
||||
}
|
||||
}
|
||||
|
||||
// Unregister the subscriber.
|
||||
pub fn unregister_subscriber(&mut self, sub_key: SubscriptionKey) {
|
||||
match sub_key {
|
||||
SubscriptionKey::All => {
|
||||
self.num_subs_to_all -= 1;
|
||||
NUM_SUBS_ALL.set(self.num_subs_to_all);
|
||||
}
|
||||
SubscriptionKey::Timeline(ttid) => {
|
||||
self.num_subs_to_timelines -= 1;
|
||||
NUM_SUBS_TIMELINE.set(self.num_subs_to_timelines);
|
||||
|
||||
// Remove from the map, destroying the channel, if we are the
|
||||
// last subscriber to this timeline.
|
||||
|
||||
// Missing entry is a bug; we must have registered.
|
||||
let chan_to_timeline_sub = self.chans_to_timeline_subs.get_mut(&ttid).unwrap();
|
||||
chan_to_timeline_sub.num_subscribers -= 1;
|
||||
if chan_to_timeline_sub.num_subscribers == 0 {
|
||||
self.chans_to_timeline_subs.remove(&ttid);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SharedState wrapper.
|
||||
#[derive(Clone)]
|
||||
struct Registry {
|
||||
shared_state: Arc<RwLock<SharedState>>,
|
||||
chan_size: usize,
|
||||
}
|
||||
|
||||
impl Registry {
|
||||
// Register new publisher in shared state.
|
||||
pub fn register_publisher(&self) -> Publisher {
|
||||
let publisher = self.shared_state.write().register_publisher(self.clone());
|
||||
trace!("registered publisher {}", publisher.id);
|
||||
publisher
|
||||
}
|
||||
|
||||
pub fn unregister_publisher(&self, publisher: &Publisher) {
|
||||
self.shared_state.write().unregister_publisher();
|
||||
trace!("unregistered publisher {}", publisher.id);
|
||||
}
|
||||
|
||||
// Register new subscriber in shared state.
|
||||
pub fn register_subscriber(&self, sub_key: SubscriptionKey) -> Subscriber {
|
||||
let subscriber =
|
||||
self.shared_state
|
||||
.write()
|
||||
.register_subscriber(sub_key, self.clone(), self.chan_size);
|
||||
trace!("registered subscriber {}", subscriber.id);
|
||||
subscriber
|
||||
}
|
||||
|
||||
// Unregister the subscriber
|
||||
pub fn unregister_subscriber(&self, subscriber: &Subscriber) {
|
||||
self.shared_state
|
||||
.write()
|
||||
.unregister_subscriber(subscriber.key);
|
||||
trace!("unregistered subscriber {}", subscriber.id);
|
||||
}
|
||||
}
|
||||
|
||||
// Private subscriber state.
|
||||
struct Subscriber {
|
||||
id: SubId,
|
||||
key: SubscriptionKey,
|
||||
// Subscriber receives messages from publishers here.
|
||||
sub_rx: broadcast::Receiver<SafekeeperTimelineInfo>,
|
||||
// to unregister itself from shared state in Drop
|
||||
registry: Registry,
|
||||
}
|
||||
|
||||
impl Drop for Subscriber {
|
||||
fn drop(&mut self) {
|
||||
self.registry.unregister_subscriber(self);
|
||||
}
|
||||
}
|
||||
|
||||
// Private publisher state
|
||||
struct Publisher {
|
||||
id: PubId,
|
||||
registry: Registry,
|
||||
}
|
||||
|
||||
impl Publisher {
|
||||
// Send msg to relevant subscribers.
|
||||
pub fn send_msg(&mut self, msg: &SafekeeperTimelineInfo) -> Result<(), Status> {
|
||||
// send message to subscribers for everything
|
||||
let shared_state = self.registry.shared_state.read();
|
||||
// Err means there is no subscribers, it is fine.
|
||||
shared_state.chan_to_all_subs.send(msg.clone()).ok();
|
||||
|
||||
// send message to per timeline subscribers
|
||||
let ttid =
|
||||
parse_proto_ttid(msg.tenant_timeline_id.as_ref().ok_or_else(|| {
|
||||
Status::new(Code::InvalidArgument, "missing tenant_timeline_id")
|
||||
})?)?;
|
||||
if let Some(subs) = shared_state.chans_to_timeline_subs.get(&ttid) {
|
||||
// Err can't happen here, as rx is destroyed after removing from the
|
||||
// map the last subscriber.
|
||||
subs.chan.send(msg.clone()).unwrap();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Publisher {
|
||||
fn drop(&mut self) {
|
||||
self.registry.unregister_publisher(self);
|
||||
}
|
||||
}
|
||||
|
||||
struct NeonBrokerImpl {
|
||||
registry: Registry,
|
||||
}
|
||||
|
||||
#[tonic::async_trait]
|
||||
impl NeonBroker for NeonBrokerImpl {
|
||||
async fn publish_safekeeper_info(
|
||||
&self,
|
||||
request: Request<tonic::Streaming<SafekeeperTimelineInfo>>,
|
||||
) -> Result<Response<()>, Status> {
|
||||
let mut publisher = self.registry.register_publisher();
|
||||
|
||||
let mut stream = request.into_inner();
|
||||
|
||||
loop {
|
||||
match stream.next().await {
|
||||
Some(Ok(msg)) => publisher.send_msg(&msg)?,
|
||||
Some(Err(e)) => return Err(e), // grpc error from the stream
|
||||
None => break, // closed stream
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Response::new(()))
|
||||
}
|
||||
|
||||
type SubscribeSafekeeperInfoStream =
|
||||
Pin<Box<dyn Stream<Item = Result<SafekeeperTimelineInfo, Status>> + Send + 'static>>;
|
||||
|
||||
async fn subscribe_safekeeper_info(
|
||||
&self,
|
||||
request: Request<SubscribeSafekeeperInfoRequest>,
|
||||
) -> Result<Response<Self::SubscribeSafekeeperInfoStream>, Status> {
|
||||
let proto_key = request
|
||||
.into_inner()
|
||||
.subscription_key
|
||||
.ok_or_else(|| Status::new(Code::InvalidArgument, "missing subscription key"))?;
|
||||
let sub_key = SubscriptionKey::from_proto_subscription_key(proto_key)?;
|
||||
let mut subscriber = self.registry.register_subscriber(sub_key);
|
||||
|
||||
// transform rx into stream with item = Result, as method result demands
|
||||
let output = async_stream::try_stream! {
|
||||
loop {
|
||||
match subscriber.sub_rx.recv().await {
|
||||
Ok(info) => yield info,
|
||||
Err(RecvError::Lagged(_skipped_msg)) => {
|
||||
// warn!("dropped {} messages, channel is full", skipped_msg);
|
||||
}
|
||||
Err(RecvError::Closed) => {
|
||||
// can't happen, we never drop the channel while there is a subscriber
|
||||
Err(Status::new(Code::Internal, "channel unexpectantly closed"))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Response::new(
|
||||
Box::pin(output) as Self::SubscribeSafekeeperInfoStream
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
// We serve only metrics through http1.
|
||||
async fn http1_handler(
|
||||
req: hyper::Request<hyper::body::Body>,
|
||||
) -> Result<hyper::Response<Body>, Infallible> {
|
||||
let resp = match (req.method(), req.uri().path()) {
|
||||
(&Method::GET, "/metrics") => {
|
||||
let mut buffer = vec![];
|
||||
let metrics = metrics::gather();
|
||||
let encoder = TextEncoder::new();
|
||||
encoder.encode(&metrics, &mut buffer).unwrap();
|
||||
|
||||
hyper::Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(CONTENT_TYPE, encoder.format_type())
|
||||
.body(Body::from(buffer))
|
||||
.unwrap()
|
||||
}
|
||||
_ => hyper::Response::builder()
|
||||
.status(StatusCode::NOT_FOUND)
|
||||
.body(Body::empty())
|
||||
.unwrap(),
|
||||
};
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let args = Args::parse();
|
||||
|
||||
#[cfg(not(feature = "console"))] // tokio-console inits tracing_subscriber on its own
|
||||
utils::logging::init(utils::logging::LogFormat::from_config(&args.log_format)?)?;
|
||||
info!("version: {GIT_VERSION}");
|
||||
|
||||
#[cfg(feature = "console")]
|
||||
console_subscriber::init();
|
||||
|
||||
let registry = Registry {
|
||||
shared_state: Arc::new(RwLock::new(SharedState::new(args.chan_size))),
|
||||
chan_size: args.chan_size,
|
||||
};
|
||||
let neon_broker_impl = NeonBrokerImpl {
|
||||
registry: registry.clone(),
|
||||
};
|
||||
let neon_broker_server = NeonBrokerServer::new(neon_broker_impl);
|
||||
|
||||
info!("listening on {}", &args.listen_addr);
|
||||
|
||||
// grpc is served along with http1 for metrics on a single port, hence we
|
||||
// don't use tonic's Server.
|
||||
hyper::Server::bind(&args.listen_addr)
|
||||
.http2_keep_alive_interval(Some(args.http2_keepalive_interval))
|
||||
.serve(make_service_fn(move |_| {
|
||||
let neon_broker_server_cloned = neon_broker_server.clone();
|
||||
async move {
|
||||
Ok::<_, Infallible>(service_fn(move |req| {
|
||||
// Technically this second clone is not needed, but consume
|
||||
// by async block is apparently unavoidable. BTW, error
|
||||
// message is enigmatic, see
|
||||
// https://github.com/rust-lang/rust/issues/68119
|
||||
//
|
||||
// We could get away without async block at all, but then we
|
||||
// need to resort to futures::Either to merge the result,
|
||||
// which doesn't caress an eye as well.
|
||||
let mut neon_broker_server_svc = neon_broker_server_cloned.clone();
|
||||
async move {
|
||||
if req.headers().get("content-type").map(|x| x.as_bytes())
|
||||
== Some(b"application/grpc")
|
||||
{
|
||||
let res_resp = neon_broker_server_svc.call(req).await;
|
||||
// Grpc and http1 handlers have slightly different
|
||||
// Response types: it is UnsyncBoxBody for the
|
||||
// former one (not sure why) and plain hyper::Body
|
||||
// for the latter. Both implement HttpBody though,
|
||||
// and EitherBody is used to merge them.
|
||||
res_resp.map(|resp| resp.map(EitherBody::Left))
|
||||
} else {
|
||||
let res_resp = http1_handler(req).await;
|
||||
res_resp.map(|resp| resp.map(EitherBody::Right))
|
||||
}
|
||||
}
|
||||
}))
|
||||
}
|
||||
}))
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use neon_broker::neon_broker_proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use tokio::sync::broadcast::error::TryRecvError;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
fn msg(timeline_id: Vec<u8>) -> SafekeeperTimelineInfo {
|
||||
SafekeeperTimelineInfo {
|
||||
safekeeper_id: 1,
|
||||
tenant_timeline_id: Some(ProtoTenantTimelineId {
|
||||
tenant_id: vec![0x00; 16],
|
||||
timeline_id,
|
||||
}),
|
||||
last_log_term: 0,
|
||||
flush_lsn: 1,
|
||||
commit_lsn: 2,
|
||||
backup_lsn: 3,
|
||||
remote_consistent_lsn: 4,
|
||||
peer_horizon_lsn: 5,
|
||||
safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(),
|
||||
local_start_lsn: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn tli_from_u64(i: u64) -> Vec<u8> {
|
||||
let mut timeline_id = vec![0xFF; 8];
|
||||
timeline_id.extend_from_slice(&i.to_be_bytes());
|
||||
timeline_id
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_registry() {
|
||||
let registry = Registry {
|
||||
shared_state: Arc::new(RwLock::new(SharedState::new(16))),
|
||||
chan_size: 16,
|
||||
};
|
||||
|
||||
// subscribe to timeline 2
|
||||
let ttid_2 = TenantTimelineId {
|
||||
tenant_id: TenantId::from_slice(&[0x00; 16]).unwrap(),
|
||||
timeline_id: TimelineId::from_slice(&tli_from_u64(2)).unwrap(),
|
||||
};
|
||||
let sub_key_2 = SubscriptionKey::Timeline(ttid_2);
|
||||
let mut subscriber_2 = registry.register_subscriber(sub_key_2);
|
||||
let mut subscriber_all = registry.register_subscriber(SubscriptionKey::All);
|
||||
|
||||
// send two messages with different keys
|
||||
let msg_1 = msg(tli_from_u64(1));
|
||||
let msg_2 = msg(tli_from_u64(2));
|
||||
let mut publisher = registry.register_publisher();
|
||||
publisher.send_msg(&msg_1).expect("failed to send msg");
|
||||
publisher.send_msg(&msg_2).expect("failed to send msg");
|
||||
|
||||
// msg with key 2 should arrive to subscriber_2
|
||||
assert!(
|
||||
matches!(subscriber_2.sub_rx.try_recv(), Ok(msg) if matches!(msg.tenant_timeline_id.as_ref(), Some(ttid) if parse_proto_ttid(ttid).unwrap() == ttid_2))
|
||||
);
|
||||
// but nothing more
|
||||
assert!(
|
||||
matches!(subscriber_2.sub_rx.try_recv(), Err(err) if matches!(err, TryRecvError::Empty))
|
||||
);
|
||||
|
||||
// subscriber_all should receive both messages
|
||||
assert!(matches!(subscriber_all.sub_rx.try_recv(), Ok(..)));
|
||||
assert!(matches!(subscriber_all.sub_rx.try_recv(), Ok(..)));
|
||||
assert!(
|
||||
matches!(subscriber_all.sub_rx.try_recv(), Err(err) if matches!(err, TryRecvError::Empty))
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1,108 +0,0 @@
|
||||
use hyper::body::HttpBody;
|
||||
use std::pin::Pin;
|
||||
use std::task::{Context, Poll};
|
||||
use tonic::codegen::StdError;
|
||||
use tonic::{transport::Channel, Code, Status};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
use neon_broker_proto::{
|
||||
neon_broker_client::NeonBrokerClient, TenantTimelineId as ProtoTenantTimelineId,
|
||||
};
|
||||
|
||||
// Code generated by protobuf.
|
||||
pub mod neon_broker_proto {
|
||||
include!("../proto/neon_broker.rs");
|
||||
}
|
||||
|
||||
pub mod metrics;
|
||||
|
||||
// Re-exports to avoid direct tonic dependency in user crates.
|
||||
pub use tonic::Request;
|
||||
pub use tonic::Streaming;
|
||||
|
||||
pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051";
|
||||
|
||||
// NeonBrokerClient charged with tonic provided Channel transport; helps to
|
||||
// avoid depending on tonic directly in user crates.
|
||||
pub type NeonBrokerClientChannel = NeonBrokerClient<Channel>;
|
||||
|
||||
impl NeonBrokerClient<Channel> {
|
||||
/// Create a new client to the given endpoint, but don't actually connect until the first request.
|
||||
pub async fn connect_lazy<D>(dst: D) -> Result<Self, tonic::transport::Error>
|
||||
where
|
||||
D: std::convert::TryInto<tonic::transport::Endpoint>,
|
||||
D::Error: Into<StdError>,
|
||||
{
|
||||
let conn = tonic::transport::Endpoint::new(dst)?.connect_lazy();
|
||||
Ok(Self::new(conn))
|
||||
}
|
||||
}
|
||||
|
||||
// parse variable length bytes from protobuf
|
||||
pub fn parse_proto_ttid(proto_ttid: &ProtoTenantTimelineId) -> Result<TenantTimelineId, Status> {
|
||||
let tenant_id = TenantId::from_slice(&proto_ttid.tenant_id)
|
||||
.map_err(|e| Status::new(Code::InvalidArgument, format!("malformed tenant_id: {}", e)))?;
|
||||
let timeline_id = TimelineId::from_slice(&proto_ttid.timeline_id).map_err(|e| {
|
||||
Status::new(
|
||||
Code::InvalidArgument,
|
||||
format!("malformed timeline_id: {}", e),
|
||||
)
|
||||
})?;
|
||||
Ok(TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
}
|
||||
|
||||
// These several usages don't justify anyhow dependency, though it would work as
|
||||
// well.
|
||||
type AnyError = Box<dyn std::error::Error + Send + Sync + 'static>;
|
||||
|
||||
// Provides impl HttpBody for two different types implementing it. Inspired by
|
||||
// https://github.com/hyperium/tonic/blob/master/examples/src/hyper_warp/server.rs
|
||||
pub enum EitherBody<A, B> {
|
||||
Left(A),
|
||||
Right(B),
|
||||
}
|
||||
|
||||
impl<A, B> HttpBody for EitherBody<A, B>
|
||||
where
|
||||
A: HttpBody + Send + Unpin,
|
||||
B: HttpBody<Data = A::Data> + Send + Unpin,
|
||||
A::Error: Into<AnyError>,
|
||||
B::Error: Into<AnyError>,
|
||||
{
|
||||
type Data = A::Data;
|
||||
type Error = Box<dyn std::error::Error + Send + Sync + 'static>;
|
||||
|
||||
fn is_end_stream(&self) -> bool {
|
||||
match self {
|
||||
EitherBody::Left(b) => b.is_end_stream(),
|
||||
EitherBody::Right(b) => b.is_end_stream(),
|
||||
}
|
||||
}
|
||||
|
||||
fn poll_data(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
) -> Poll<Option<Result<Self::Data, Self::Error>>> {
|
||||
match self.get_mut() {
|
||||
EitherBody::Left(b) => Pin::new(b).poll_data(cx).map(map_option_err),
|
||||
EitherBody::Right(b) => Pin::new(b).poll_data(cx).map(map_option_err),
|
||||
}
|
||||
}
|
||||
|
||||
fn poll_trailers(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
) -> Poll<Result<Option<hyper::HeaderMap>, Self::Error>> {
|
||||
match self.get_mut() {
|
||||
EitherBody::Left(b) => Pin::new(b).poll_trailers(cx).map_err(Into::into),
|
||||
EitherBody::Right(b) => Pin::new(b).poll_trailers(cx).map_err(Into::into),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn map_option_err<T, U: Into<AnyError>>(err: Option<Result<T, U>>) -> Option<Result<T, AnyError>> {
|
||||
err.map(|e| e.map_err(Into::into))
|
||||
}
|
||||
@@ -1,25 +0,0 @@
|
||||
//! Broker metrics.
|
||||
|
||||
use metrics::{register_int_gauge, IntGauge};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub static NUM_PUBS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!("broker_active_publishers", "Number of publications")
|
||||
.expect("Failed to register metric")
|
||||
});
|
||||
|
||||
pub static NUM_SUBS_TIMELINE: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"broker_per_timeline_active_subscribers",
|
||||
"Number of subsciptions to particular tenant timeline id"
|
||||
)
|
||||
.expect("Failed to register metric")
|
||||
});
|
||||
|
||||
pub static NUM_SUBS_ALL: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"broker_all_keys_active_subscribers",
|
||||
"Number of subsciptions to all keys"
|
||||
)
|
||||
.expect("Failed to register metric")
|
||||
});
|
||||
@@ -65,7 +65,7 @@ impl GenericOption {
|
||||
let name = match self.name.as_str() {
|
||||
"safekeepers" => "neon.safekeepers",
|
||||
"wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
|
||||
"wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout",
|
||||
"wal_acceptor_connect_timeout" => "neon.safekeeper_connect_timeout",
|
||||
it => it,
|
||||
};
|
||||
|
||||
|
||||
@@ -4,21 +4,20 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
clap = "4.0"
|
||||
comfy-table = "6.1"
|
||||
git-version = "0.3.5"
|
||||
nix = "0.25"
|
||||
once_cell = "1.13.0"
|
||||
tar = "0.4.38"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
regex = "1"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "2.0"
|
||||
tar = "0.4.38"
|
||||
thiserror = "1"
|
||||
toml = "0.5"
|
||||
url = "2.2.2"
|
||||
once_cell = "1.13.0"
|
||||
regex = "1"
|
||||
anyhow = "1.0"
|
||||
thiserror = "1"
|
||||
nix = "0.25"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
|
||||
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
|
||||
# instead, so that recompile times are better.
|
||||
|
||||
@@ -1,264 +0,0 @@
|
||||
//! Spawns and kills background processes that are needed by Neon CLI.
|
||||
//! Applies common set-up such as log and pid files (if needed) to every process.
|
||||
//!
|
||||
//! Neon CLI does not run in background, so it needs to store the information about
|
||||
//! spawned processes, which it does in this module.
|
||||
//! We do that by storing the pid of the process in the "${process_name}.pid" file.
|
||||
//! The pid file can be created by the process itself
|
||||
//! (Neon storage binaries do that and also ensure that a lock is taken onto that file)
|
||||
//! or we create such file after starting the process
|
||||
//! (non-Neon binaries don't necessarily follow our pidfile conventions).
|
||||
//! The pid stored in the file is later used to stop the service.
|
||||
//!
|
||||
//! See [`lock_file`] module for more info.
|
||||
|
||||
use std::ffi::OsStr;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::process::{Child, Command};
|
||||
use std::time::Duration;
|
||||
use std::{fs, io, thread};
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
|
||||
use utils::lock_file;
|
||||
|
||||
const RETRIES: u32 = 15;
|
||||
const RETRY_TIMEOUT_MILLIS: u64 = 500;
|
||||
|
||||
/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
|
||||
/// it itself.
|
||||
pub enum InitialPidFile<'t> {
|
||||
/// Create a pidfile, to allow future CLI invocations to manipulate the process.
|
||||
Create(&'t Path),
|
||||
/// The process will create the pidfile itself, need to wait for that event.
|
||||
Expect(&'t Path),
|
||||
}
|
||||
|
||||
/// Start a background child process using the parameters given.
|
||||
pub fn start_process<F, S: AsRef<OsStr>>(
|
||||
process_name: &str,
|
||||
datadir: &Path,
|
||||
command: &Path,
|
||||
args: &[S],
|
||||
initial_pid_file: InitialPidFile,
|
||||
process_status_check: F,
|
||||
) -> anyhow::Result<Child>
|
||||
where
|
||||
F: Fn() -> anyhow::Result<bool>,
|
||||
{
|
||||
let log_path = datadir.join(format!("{process_name}.log"));
|
||||
let process_log_file = fs::OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
.append(true)
|
||||
.open(&log_path)
|
||||
.with_context(|| {
|
||||
format!("Could not open {process_name} log file {log_path:?} for writing")
|
||||
})?;
|
||||
let same_file_for_stderr = process_log_file.try_clone().with_context(|| {
|
||||
format!("Could not reuse {process_name} log file {log_path:?} for writing stderr")
|
||||
})?;
|
||||
|
||||
let mut command = Command::new(command);
|
||||
let background_command = command
|
||||
.stdout(process_log_file)
|
||||
.stderr(same_file_for_stderr)
|
||||
.args(args);
|
||||
let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
|
||||
|
||||
let mut spawned_process = filled_cmd.spawn().with_context(|| {
|
||||
format!("Could not spawn {process_name}, see console output and log files for details.")
|
||||
})?;
|
||||
let pid = spawned_process.id();
|
||||
let pid = Pid::from_raw(
|
||||
i32::try_from(pid)
|
||||
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
|
||||
);
|
||||
|
||||
let pid_file_to_check = match initial_pid_file {
|
||||
InitialPidFile::Create(target_pid_file_path) => {
|
||||
match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
|
||||
lock_file::LockCreationResult::Created { .. } => {
|
||||
// We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
|
||||
// as this CLI invocation exits, so it's a bit useless, but doesn't any harm either.
|
||||
}
|
||||
lock_file::LockCreationResult::AlreadyLocked { .. } => {
|
||||
anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
|
||||
}
|
||||
lock_file::LockCreationResult::CreationFailed(e) => {
|
||||
return Err(e.context(format!(
|
||||
"Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
|
||||
)))
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
|
||||
};
|
||||
|
||||
for retries in 0..RETRIES {
|
||||
match process_started(pid, pid_file_to_check, &process_status_check) {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} started, pid: {pid}");
|
||||
return Ok(spawned_process);
|
||||
}
|
||||
Ok(false) => {
|
||||
if retries < 5 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
} else {
|
||||
if retries == 5 {
|
||||
println!() // put a line break after dots for second message
|
||||
}
|
||||
println!("{process_name} has not started yet, retrying ({retries})...");
|
||||
}
|
||||
thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS));
|
||||
}
|
||||
Err(e) => {
|
||||
println!("{process_name} failed to start: {e:#}");
|
||||
if let Err(e) = spawned_process.kill() {
|
||||
println!("Could not stop {process_name} subprocess: {e:#}")
|
||||
};
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
anyhow::bail!("{process_name} could not start in {RETRIES} attempts");
|
||||
}
|
||||
|
||||
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
|
||||
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
|
||||
if !pid_file.exists() {
|
||||
println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
|
||||
return Ok(());
|
||||
}
|
||||
let pid = read_pidfile(pid_file)?;
|
||||
|
||||
let sig = if immediate {
|
||||
print!("Stopping {process_name} with pid {pid} immediately..");
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
print!("Stopping {process_name} with pid {pid} gracefully..");
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(()) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!(
|
||||
"{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
Err(e) => anyhow::bail!("Failed to send signal to {process_name} with pid {pid}: {e}"),
|
||||
}
|
||||
|
||||
// Wait until process is gone
|
||||
for _ in 0..RETRIES {
|
||||
match process_has_stopped(pid) {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} stopped");
|
||||
if let Err(e) = fs::remove_file(pid_file) {
|
||||
if e.kind() != io::ErrorKind::NotFound {
|
||||
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
Ok(false) => {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
thread::sleep(Duration::from_secs(1))
|
||||
}
|
||||
Err(e) => {
|
||||
println!("{process_name} with pid {pid} failed to stop: {e:#}");
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts");
|
||||
}
|
||||
|
||||
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
|
||||
let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
|
||||
|
||||
let var = "LLVM_PROFILE_FILE";
|
||||
if let Some(val) = std::env::var_os(var) {
|
||||
filled_cmd = filled_cmd.env(var, val);
|
||||
}
|
||||
|
||||
const RUST_LOG_KEY: &str = "RUST_LOG";
|
||||
if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
|
||||
filled_cmd.env(RUST_LOG_KEY, rust_log_value)
|
||||
} else {
|
||||
filled_cmd
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
for env_key in [
|
||||
"AWS_ACCESS_KEY_ID",
|
||||
"AWS_SECRET_ACCESS_KEY",
|
||||
"AWS_SESSION_TOKEN",
|
||||
] {
|
||||
if let Ok(value) = std::env::var(env_key) {
|
||||
cmd = cmd.env(env_key, value);
|
||||
}
|
||||
}
|
||||
cmd
|
||||
}
|
||||
|
||||
fn process_started<F>(
|
||||
pid: Pid,
|
||||
pid_file_to_check: Option<&Path>,
|
||||
status_check: &F,
|
||||
) -> anyhow::Result<bool>
|
||||
where
|
||||
F: Fn() -> anyhow::Result<bool>,
|
||||
{
|
||||
match status_check() {
|
||||
Ok(true) => match pid_file_to_check {
|
||||
Some(pid_file_path) => {
|
||||
if pid_file_path.exists() {
|
||||
let pid_in_file = read_pidfile(pid_file_path)?;
|
||||
Ok(pid_in_file == pid)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
None => Ok(true),
|
||||
},
|
||||
Ok(false) => Ok(false),
|
||||
Err(e) => anyhow::bail!("process failed to start: {e}"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Read a PID file
|
||||
///
|
||||
/// We expect a file that contains a single integer.
|
||||
fn read_pidfile(pidfile: &Path) -> Result<Pid> {
|
||||
let pid_str = fs::read_to_string(pidfile)
|
||||
.with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
|
||||
let pid: i32 = pid_str
|
||||
.parse()
|
||||
.map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
|
||||
if pid < 1 {
|
||||
bail!("pidfile {pidfile:?} contained bad value '{pid}'");
|
||||
}
|
||||
Ok(Pid::from_raw(pid))
|
||||
}
|
||||
|
||||
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
|
||||
match kill(pid, None) {
|
||||
// Process exists, keep waiting
|
||||
Ok(_) => Ok(false),
|
||||
// Process not found, we're done
|
||||
Err(Errno::ESRCH) => Ok(true),
|
||||
Err(err) => anyhow::bail!("Failed to send signal to process with pid {pid}: {err}"),
|
||||
}
|
||||
}
|
||||
@@ -9,8 +9,8 @@ use anyhow::{anyhow, bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
|
||||
use control_plane::compute::ComputeControlPlane;
|
||||
use control_plane::local_env::{EtcdBroker, LocalEnv};
|
||||
use control_plane::pageserver::PageServerNode;
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage::PageServerNode;
|
||||
use control_plane::{etcd, local_env};
|
||||
use pageserver_api::models::TimelineInfo;
|
||||
use pageserver_api::{
|
||||
|
||||
@@ -12,14 +12,15 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use utils::{
|
||||
connstring::connection_host_port,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
|
||||
use crate::pageserver::PageServerNode;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
use crate::storage::PageServerNode;
|
||||
|
||||
//
|
||||
// ComputeControlPlane
|
||||
@@ -299,8 +300,7 @@ impl PostgresNode {
|
||||
|
||||
// Configure the node to fetch pages from pageserver
|
||||
let pageserver_connstr = {
|
||||
let config = &self.pageserver.pg_connection_config;
|
||||
let (host, port) = (config.host(), config.port());
|
||||
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
|
||||
|
||||
// Set up authentication
|
||||
//
|
||||
|
||||
@@ -1,57 +0,0 @@
|
||||
use url::Url;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PgConnectionConfig {
|
||||
url: Url,
|
||||
}
|
||||
|
||||
impl PgConnectionConfig {
|
||||
pub fn host(&self) -> &str {
|
||||
self.url.host_str().expect("BUG: no host")
|
||||
}
|
||||
|
||||
pub fn port(&self) -> u16 {
|
||||
self.url.port().expect("BUG: no port")
|
||||
}
|
||||
|
||||
/// Return a `<host>:<port>` string.
|
||||
pub fn raw_address(&self) -> String {
|
||||
format!("{}:{}", self.host(), self.port())
|
||||
}
|
||||
|
||||
/// Connect using postgres protocol with TLS disabled.
|
||||
pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
|
||||
postgres::Client::connect(self.url.as_str(), postgres::NoTls)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for PgConnectionConfig {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let mut url: Url = s.parse()?;
|
||||
|
||||
match url.scheme() {
|
||||
"postgres" | "postgresql" => {}
|
||||
other => anyhow::bail!("invalid scheme: {other}"),
|
||||
}
|
||||
|
||||
// It's not a valid connection url if host is unavailable.
|
||||
if url.host().is_none() {
|
||||
anyhow::bail!(url::ParseError::EmptyHost);
|
||||
}
|
||||
|
||||
// E.g. `postgres:bar`.
|
||||
if url.cannot_be_a_base() {
|
||||
anyhow::bail!("URL cannot be a base");
|
||||
}
|
||||
|
||||
// Set the default PG port if it's missing.
|
||||
if url.port().is_none() {
|
||||
url.set_port(Some(5432))
|
||||
.expect("BUG: couldn't set the default port");
|
||||
}
|
||||
|
||||
Ok(Self { url })
|
||||
}
|
||||
}
|
||||
@@ -1,75 +1,95 @@
|
||||
use std::{fs, path::PathBuf};
|
||||
use std::{
|
||||
fs,
|
||||
path::PathBuf,
|
||||
process::{Command, Stdio},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use nix::{
|
||||
sys::signal::{kill, Signal},
|
||||
unistd::Pid,
|
||||
};
|
||||
|
||||
use crate::{background_process, local_env};
|
||||
use crate::{local_env, read_pidfile};
|
||||
|
||||
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let etcd_broker = &env.etcd_broker;
|
||||
println!(
|
||||
"Starting etcd broker using {:?}",
|
||||
etcd_broker.etcd_binary_path
|
||||
"Starting etcd broker using {}",
|
||||
etcd_broker.etcd_binary_path.display()
|
||||
);
|
||||
|
||||
let etcd_data_dir = env.base_data_dir.join("etcd");
|
||||
fs::create_dir_all(&etcd_data_dir)
|
||||
.with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?;
|
||||
fs::create_dir_all(&etcd_data_dir).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd data dir: {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let etcd_stdout_file =
|
||||
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd stout file in directory {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
let etcd_stderr_file =
|
||||
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd stderr file in directory {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
let client_urls = etcd_broker.comma_separated_endpoints();
|
||||
let args = [
|
||||
format!("--data-dir={}", etcd_data_dir.display()),
|
||||
format!("--listen-client-urls={client_urls}"),
|
||||
format!("--advertise-client-urls={client_urls}"),
|
||||
// Set --quota-backend-bytes to keep the etcd virtual memory
|
||||
// size smaller. Our test etcd clusters are very small.
|
||||
// See https://github.com/etcd-io/etcd/issues/7910
|
||||
"--quota-backend-bytes=100000000".to_string(),
|
||||
// etcd doesn't compact (vacuum) with default settings,
|
||||
// enable it to prevent space exhaustion.
|
||||
"--auto-compaction-mode=revision".to_string(),
|
||||
"--auto-compaction-retention=1".to_string(),
|
||||
];
|
||||
|
||||
let pid_file_path = etcd_pid_file_path(env);
|
||||
let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
|
||||
.args(&[
|
||||
format!("--data-dir={}", etcd_data_dir.display()),
|
||||
format!("--listen-client-urls={client_urls}"),
|
||||
format!("--advertise-client-urls={client_urls}"),
|
||||
// Set --quota-backend-bytes to keep the etcd virtual memory
|
||||
// size smaller. Our test etcd clusters are very small.
|
||||
// See https://github.com/etcd-io/etcd/issues/7910
|
||||
"--quota-backend-bytes=100000000".to_string(),
|
||||
])
|
||||
.stdout(Stdio::from(etcd_stdout_file))
|
||||
.stderr(Stdio::from(etcd_stderr_file))
|
||||
.spawn()
|
||||
.context("Failed to spawn etcd subprocess")?;
|
||||
let pid = etcd_process.id();
|
||||
|
||||
let client = reqwest::blocking::Client::new();
|
||||
|
||||
background_process::start_process(
|
||||
"etcd",
|
||||
&etcd_data_dir,
|
||||
&etcd_broker.etcd_binary_path,
|
||||
&args,
|
||||
background_process::InitialPidFile::Create(&pid_file_path),
|
||||
|| {
|
||||
for broker_endpoint in &etcd_broker.broker_endpoints {
|
||||
let request = broker_endpoint
|
||||
.join("health")
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to append /health path to broker endopint {}",
|
||||
broker_endpoint
|
||||
)
|
||||
})
|
||||
.and_then(|url| {
|
||||
client.get(&url.to_string()).build().with_context(|| {
|
||||
format!("Failed to construct request to etcd endpoint {url}")
|
||||
})
|
||||
})?;
|
||||
if client.execute(request).is_ok() {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(false)
|
||||
},
|
||||
)
|
||||
.context("Failed to spawn etcd subprocess")?;
|
||||
let etcd_pid_file_path = etcd_pid_file_path(env);
|
||||
fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd pid file at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
background_process::stop_process(true, "etcd", &etcd_pid_file_path(env))
|
||||
let etcd_path = &env.etcd_broker.etcd_binary_path;
|
||||
println!("Stopping etcd broker at {}", etcd_path.display());
|
||||
|
||||
let etcd_pid_file_path = etcd_pid_file_path(env);
|
||||
let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read etcd pid file at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?);
|
||||
|
||||
kill(pid, Signal::SIGTERM).with_context(|| {
|
||||
format!(
|
||||
"Failed to stop etcd with pid {pid} at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
|
||||
|
||||
@@ -6,12 +6,59 @@
|
||||
// Intended to be used in integration tests and in CLI tools for
|
||||
// local installations.
|
||||
//
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
|
||||
mod background_process;
|
||||
pub mod compute;
|
||||
pub mod connection;
|
||||
pub mod etcd;
|
||||
pub mod local_env;
|
||||
pub mod pageserver;
|
||||
pub mod postgresql_conf;
|
||||
pub mod safekeeper;
|
||||
pub mod storage;
|
||||
|
||||
/// Read a PID file
|
||||
///
|
||||
/// We expect a file that contains a single integer.
|
||||
/// We return an i32 for compatibility with libc and nix.
|
||||
pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
|
||||
let pid_str = fs::read_to_string(pidfile)
|
||||
.with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
|
||||
let pid: i32 = pid_str
|
||||
.parse()
|
||||
.map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
|
||||
if pid < 1 {
|
||||
bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
|
||||
}
|
||||
Ok(pid)
|
||||
}
|
||||
|
||||
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
|
||||
let cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
|
||||
|
||||
let var = "LLVM_PROFILE_FILE";
|
||||
if let Some(val) = std::env::var_os(var) {
|
||||
cmd.env(var, val);
|
||||
}
|
||||
|
||||
const RUST_LOG_KEY: &str = "RUST_LOG";
|
||||
if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
|
||||
cmd.env(RUST_LOG_KEY, rust_log_value)
|
||||
} else {
|
||||
cmd
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
for env_key in [
|
||||
"AWS_ACCESS_KEY_ID",
|
||||
"AWS_SECRET_ACCESS_KEY",
|
||||
"AWS_SESSION_TOKEN",
|
||||
] {
|
||||
if let Ok(value) = std::env::var(env_key) {
|
||||
cmd = cmd.env(env_key, value);
|
||||
}
|
||||
}
|
||||
cmd
|
||||
}
|
||||
|
||||
@@ -226,12 +226,12 @@ impl LocalEnv {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pageserver_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("pageserver")
|
||||
pub fn pageserver_bin(&self) -> anyhow::Result<PathBuf> {
|
||||
Ok(self.neon_distrib_dir.join("pageserver"))
|
||||
}
|
||||
|
||||
pub fn safekeeper_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("safekeeper")
|
||||
pub fn safekeeper_bin(&self) -> anyhow::Result<PathBuf> {
|
||||
Ok(self.neon_distrib_dir.join("safekeeper"))
|
||||
}
|
||||
|
||||
pub fn pg_data_dirs_path(&self) -> PathBuf {
|
||||
|
||||
@@ -1,21 +1,23 @@
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Child;
|
||||
use std::process::Command;
|
||||
use std::sync::Arc;
|
||||
use std::{io, result};
|
||||
use std::time::Duration;
|
||||
use std::{io, result, thread};
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::bail;
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use postgres::Config;
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::{http::error::HttpErrorBody, id::NodeId};
|
||||
use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId};
|
||||
|
||||
use crate::connection::PgConnectionConfig;
|
||||
use crate::pageserver::PageServerNode;
|
||||
use crate::{
|
||||
background_process,
|
||||
local_env::{LocalEnv, SafekeeperConf},
|
||||
};
|
||||
use crate::local_env::{LocalEnv, SafekeeperConf};
|
||||
use crate::storage::PageServerNode;
|
||||
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum SafekeeperHttpError {
|
||||
@@ -61,7 +63,7 @@ pub struct SafekeeperNode {
|
||||
|
||||
pub conf: SafekeeperConf,
|
||||
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub pg_connection_config: Config,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: Client,
|
||||
pub http_base_url: String,
|
||||
@@ -85,15 +87,15 @@ impl SafekeeperNode {
|
||||
}
|
||||
|
||||
/// Construct libpq connection string for connecting to this safekeeper.
|
||||
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
|
||||
fn safekeeper_connection_config(port: u16) -> Config {
|
||||
// TODO safekeeper authentication not implemented yet
|
||||
format!("postgresql://no_user@127.0.0.1:{port}/no_db")
|
||||
format!("postgresql://no_user@127.0.0.1:{}/no_db", port)
|
||||
.parse()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
|
||||
env.safekeeper_data_dir(&format!("sk{sk_id}"))
|
||||
env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
|
||||
}
|
||||
|
||||
pub fn datadir_path(&self) -> PathBuf {
|
||||
@@ -104,78 +106,91 @@ impl SafekeeperNode {
|
||||
self.datadir_path().join("safekeeper.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> anyhow::Result<Child> {
|
||||
pub fn start(&self) -> anyhow::Result<()> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
self.pg_connection_config.raw_address(),
|
||||
connection_address(&self.pg_connection_config),
|
||||
self.datadir_path().display()
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
|
||||
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
|
||||
let id = self.id;
|
||||
let datadir = self.datadir_path();
|
||||
|
||||
let id_string = id.to_string();
|
||||
let mut args = vec![
|
||||
"-D",
|
||||
datadir.to_str().with_context(|| {
|
||||
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
"--id",
|
||||
&id_string,
|
||||
"--listen-pg",
|
||||
&listen_pg,
|
||||
"--listen-http",
|
||||
&listen_http,
|
||||
];
|
||||
let mut cmd = Command::new(self.env.safekeeper_bin()?);
|
||||
fill_rust_env_vars(
|
||||
cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
|
||||
.args(&["--id", self.id.to_string().as_ref()])
|
||||
.args(&["--listen-pg", &listen_pg])
|
||||
.args(&["--listen-http", &listen_http])
|
||||
.arg("--daemonize"),
|
||||
);
|
||||
if !self.conf.sync {
|
||||
args.push("--no-sync");
|
||||
cmd.arg("--no-sync");
|
||||
}
|
||||
|
||||
let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
|
||||
if !comma_separated_endpoints.is_empty() {
|
||||
args.extend(["--broker-endpoints", &comma_separated_endpoints]);
|
||||
cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
|
||||
}
|
||||
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
|
||||
args.extend(["--broker-etcd-prefix", prefix]);
|
||||
cmd.args(&["--broker-etcd-prefix", prefix]);
|
||||
}
|
||||
|
||||
let mut backup_threads = String::new();
|
||||
if let Some(threads) = self.conf.backup_threads {
|
||||
backup_threads = threads.to_string();
|
||||
args.extend(["--backup-threads", &backup_threads]);
|
||||
} else {
|
||||
drop(backup_threads);
|
||||
cmd.args(&["--backup-threads", threads.to_string().as_ref()]);
|
||||
}
|
||||
|
||||
if let Some(ref remote_storage) = self.conf.remote_storage {
|
||||
args.extend(["--remote-storage", remote_storage]);
|
||||
cmd.args(&["--remote-storage", remote_storage]);
|
||||
}
|
||||
|
||||
let key_path = self.env.base_data_dir.join("auth_public_key.pem");
|
||||
if self.conf.auth_enabled {
|
||||
args.extend([
|
||||
"--auth-validation-public-key-path",
|
||||
key_path.to_str().with_context(|| {
|
||||
format!("Key path {key_path:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
]);
|
||||
cmd.arg("--auth-validation-public-key-path");
|
||||
// PathBuf is better be passed as is, not via `String`.
|
||||
cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
|
||||
}
|
||||
|
||||
background_process::start_process(
|
||||
&format!("safekeeper {id}"),
|
||||
&datadir,
|
||||
&self.env.safekeeper_bin(),
|
||||
&args,
|
||||
background_process::InitialPidFile::Expect(&self.pid_file()),
|
||||
|| match self.check_status() {
|
||||
Ok(()) => Ok(true),
|
||||
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||
},
|
||||
)
|
||||
fill_aws_secrets_vars(&mut cmd);
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Safekeeper failed to start. See '{}' for details.",
|
||||
self.datadir_path().join("safekeeper.log").display()
|
||||
);
|
||||
}
|
||||
|
||||
// It takes a while for the safekeeper to start up. Wait until it is
|
||||
// open for business.
|
||||
const RETRIES: i8 = 15;
|
||||
for retries in 1..RETRIES {
|
||||
match self.check_status() {
|
||||
Ok(_) => {
|
||||
println!("\nSafekeeper started");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => {
|
||||
match err {
|
||||
SafekeeperHttpError::Transport(err) => {
|
||||
if err.is_connect() && retries < 5 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
} else {
|
||||
if retries == 5 {
|
||||
println!() // put a line break after dots for second message
|
||||
}
|
||||
println!(
|
||||
"Safekeeper not responding yet, err {} retrying ({})...",
|
||||
err, retries
|
||||
);
|
||||
}
|
||||
}
|
||||
SafekeeperHttpError::Response(msg) => {
|
||||
bail!("safekeeper failed to start: {} ", msg)
|
||||
}
|
||||
}
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("safekeeper failed to start in {} seconds", RETRIES);
|
||||
}
|
||||
|
||||
///
|
||||
@@ -187,11 +202,63 @@ impl SafekeeperNode {
|
||||
/// If the server is not running, returns success
|
||||
///
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(
|
||||
immediate,
|
||||
&format!("safekeeper {}", self.id),
|
||||
&self.pid_file(),
|
||||
)
|
||||
let pid_file = self.pid_file();
|
||||
if !pid_file.exists() {
|
||||
println!("Safekeeper {} is already stopped", self.id);
|
||||
return Ok(());
|
||||
}
|
||||
let pid = read_pidfile(&pid_file)?;
|
||||
let pid = Pid::from_raw(pid);
|
||||
|
||||
let sig = if immediate {
|
||||
print!("Stopping safekeeper {} immediately..", self.id);
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
print!("Stopping safekeeper {} gracefully..", self.id);
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!(
|
||||
"Safekeeper with pid {} does not exist, but a PID file was found",
|
||||
pid
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to safekeeper with pid {}: {}",
|
||||
pid,
|
||||
err.desc()
|
||||
),
|
||||
}
|
||||
|
||||
// Wait until process is gone
|
||||
for i in 0..600 {
|
||||
let signal = None; // Send no signal, just get the error code
|
||||
match kill(pid, signal) {
|
||||
Ok(_) => (), // Process exists, keep waiting
|
||||
Err(Errno::ESRCH) => {
|
||||
// Process not found, we're done
|
||||
println!("done!");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to pageserver with pid {}: {}",
|
||||
pid,
|
||||
err.desc()
|
||||
),
|
||||
};
|
||||
|
||||
if i % 10 == 0 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
|
||||
bail!("Failed to stop safekeeper with pid {}", pid);
|
||||
}
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
|
||||
@@ -1,27 +1,33 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs::{self, File};
|
||||
use std::fs::File;
|
||||
use std::io::{BufReader, Write};
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Child;
|
||||
use std::{io, result};
|
||||
use std::process::Command;
|
||||
use std::time::Duration;
|
||||
use std::{io, result, thread};
|
||||
|
||||
use crate::connection::PgConnectionConfig;
|
||||
use anyhow::{bail, Context};
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use pageserver_api::models::{
|
||||
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use postgres::{Config, NoTls};
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
http::error::HttpErrorBody,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum PageserverHttpError {
|
||||
@@ -69,7 +75,7 @@ impl ResponseErrorMessageExt for Response {
|
||||
//
|
||||
#[derive(Debug)]
|
||||
pub struct PageServerNode {
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub pg_connection_config: Config,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: Client,
|
||||
pub http_base_url: String,
|
||||
@@ -95,7 +101,7 @@ impl PageServerNode {
|
||||
}
|
||||
|
||||
/// Construct libpq connection string for connecting to the pageserver.
|
||||
fn pageserver_connection_config(password: &str, listen_addr: &str) -> PgConnectionConfig {
|
||||
fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config {
|
||||
format!("postgresql://no_user:{password}@{listen_addr}/no_db")
|
||||
.parse()
|
||||
.unwrap()
|
||||
@@ -155,15 +161,7 @@ impl PageServerNode {
|
||||
init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'");
|
||||
}
|
||||
|
||||
let mut pageserver_process = self
|
||||
.start_node(&init_config_overrides, &self.env.base_data_dir, true)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to start a process for pageserver {}",
|
||||
self.env.pageserver.id,
|
||||
)
|
||||
})?;
|
||||
|
||||
self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?;
|
||||
let init_result = self
|
||||
.try_init_timeline(create_tenant, initial_timeline_id, pg_version)
|
||||
.context("Failed to create initial tenant and timeline for pageserver");
|
||||
@@ -173,29 +171,7 @@ impl PageServerNode {
|
||||
}
|
||||
Err(e) => eprintln!("{e:#}"),
|
||||
}
|
||||
match pageserver_process.kill() {
|
||||
Err(e) => {
|
||||
eprintln!(
|
||||
"Failed to stop pageserver {} process with pid {}: {e:#}",
|
||||
self.env.pageserver.id,
|
||||
pageserver_process.id(),
|
||||
)
|
||||
}
|
||||
Ok(()) => {
|
||||
println!(
|
||||
"Stopped pageserver {} process with pid {}",
|
||||
self.env.pageserver.id,
|
||||
pageserver_process.id(),
|
||||
);
|
||||
// cleanup after pageserver startup, since we do not call regular `stop_process` during init
|
||||
let pid_file = self.pid_file();
|
||||
if let Err(e) = fs::remove_file(&pid_file) {
|
||||
if e.kind() != io::ErrorKind::NotFound {
|
||||
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
self.stop(false)?;
|
||||
init_result
|
||||
}
|
||||
|
||||
@@ -220,14 +196,11 @@ impl PageServerNode {
|
||||
self.env.pageserver_data_dir()
|
||||
}
|
||||
|
||||
/// The pid file is created by the pageserver process, with its pid stored inside.
|
||||
/// Other pageservers cannot lock the same file and overwrite it for as long as the current
|
||||
/// pageserver runs. (Unless someone removes the file manually; never do that!)
|
||||
fn pid_file(&self) -> PathBuf {
|
||||
pub fn pid_file(&self) -> PathBuf {
|
||||
self.repo_path().join("pageserver.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
|
||||
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
self.start_node(config_overrides, &self.repo_path(), false)
|
||||
}
|
||||
|
||||
@@ -236,10 +209,10 @@ impl PageServerNode {
|
||||
config_overrides: &[&str],
|
||||
datadir: &Path,
|
||||
update_config: bool,
|
||||
) -> anyhow::Result<Child> {
|
||||
) -> anyhow::Result<()> {
|
||||
println!(
|
||||
"Starting pageserver at '{}' in '{}'",
|
||||
self.pg_connection_config.raw_address(),
|
||||
connection_address(&self.pg_connection_config),
|
||||
datadir.display()
|
||||
);
|
||||
io::stdout().flush()?;
|
||||
@@ -247,7 +220,10 @@ impl PageServerNode {
|
||||
let mut args = vec![
|
||||
"-D",
|
||||
datadir.to_str().with_context(|| {
|
||||
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
|
||||
format!(
|
||||
"Datadir path '{}' cannot be represented as a unicode string",
|
||||
datadir.display()
|
||||
)
|
||||
})?,
|
||||
];
|
||||
|
||||
@@ -259,18 +235,48 @@ impl PageServerNode {
|
||||
args.extend(["-c", config_override]);
|
||||
}
|
||||
|
||||
background_process::start_process(
|
||||
"pageserver",
|
||||
datadir,
|
||||
&self.env.pageserver_bin(),
|
||||
&args,
|
||||
background_process::InitialPidFile::Expect(&self.pid_file()),
|
||||
|| match self.check_status() {
|
||||
Ok(()) => Ok(true),
|
||||
Err(PageserverHttpError::Transport(_)) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||
},
|
||||
)
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
|
||||
filled_cmd = fill_aws_secrets_vars(filled_cmd);
|
||||
|
||||
if !filled_cmd.status()?.success() {
|
||||
bail!(
|
||||
"Pageserver failed to start. See console output and '{}' for details.",
|
||||
datadir.join("pageserver.log").display()
|
||||
);
|
||||
}
|
||||
|
||||
// It takes a while for the page server to start up. Wait until it is
|
||||
// open for business.
|
||||
const RETRIES: i8 = 15;
|
||||
for retries in 1..RETRIES {
|
||||
match self.check_status() {
|
||||
Ok(()) => {
|
||||
println!("\nPageserver started");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => {
|
||||
match err {
|
||||
PageserverHttpError::Transport(err) => {
|
||||
if err.is_connect() && retries < 5 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
} else {
|
||||
if retries == 5 {
|
||||
println!() // put a line break after dots for second message
|
||||
}
|
||||
println!("Pageserver not responding yet, err {err} retrying ({retries})...");
|
||||
}
|
||||
}
|
||||
PageserverHttpError::Response(msg) => {
|
||||
bail!("pageserver failed to start: {msg} ")
|
||||
}
|
||||
}
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("pageserver failed to start in {RETRIES} seconds");
|
||||
}
|
||||
|
||||
///
|
||||
@@ -282,18 +288,69 @@ impl PageServerNode {
|
||||
/// If the server is not running, returns success
|
||||
///
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, "pageserver", &self.pid_file())
|
||||
let pid_file = self.pid_file();
|
||||
if !pid_file.exists() {
|
||||
println!("Pageserver is already stopped");
|
||||
return Ok(());
|
||||
}
|
||||
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
|
||||
|
||||
let sig = if immediate {
|
||||
print!("Stopping pageserver immediately..");
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
print!("Stopping pageserver gracefully..");
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!("Pageserver with pid {pid} does not exist, but a PID file was found");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to pageserver with pid {pid}: {}",
|
||||
err.desc()
|
||||
),
|
||||
}
|
||||
|
||||
// Wait until process is gone
|
||||
for i in 0..600 {
|
||||
let signal = None; // Send no signal, just get the error code
|
||||
match kill(pid, signal) {
|
||||
Ok(_) => (), // Process exists, keep waiting
|
||||
Err(Errno::ESRCH) => {
|
||||
// Process not found, we're done
|
||||
println!("done!");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to pageserver with pid {}: {}",
|
||||
pid,
|
||||
err.desc()
|
||||
),
|
||||
};
|
||||
|
||||
if i % 10 == 0 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
|
||||
bail!("Failed to stop pageserver with pid {pid}");
|
||||
}
|
||||
|
||||
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
|
||||
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
|
||||
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
|
||||
|
||||
println!("Pageserver query: '{sql}'");
|
||||
client.simple_query(sql).unwrap()
|
||||
}
|
||||
|
||||
pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
|
||||
self.pg_connection_config.connect_no_tls()
|
||||
self.pg_connection_config.connect(NoTls)
|
||||
}
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
@@ -362,11 +419,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<NonZeroU64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
||||
trace_read_requests: settings
|
||||
.remove("trace_read_requests")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
@@ -429,11 +481,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<NonZeroU64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
||||
trace_read_requests: settings
|
||||
.get("trace_read_requests")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
@@ -502,7 +549,7 @@ impl PageServerNode {
|
||||
pg_wal: Option<(Lsn, PathBuf)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
|
||||
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
|
||||
|
||||
// Init base reader
|
||||
let (start_lsn, base_tarfile_path) = base;
|
||||
@@ -1,13 +0,0 @@
|
||||
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
ARG COMPUTE_IMAGE=compute-node-v14
|
||||
ARG TAG=latest
|
||||
|
||||
FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG
|
||||
|
||||
USER root
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl \
|
||||
jq \
|
||||
netcat
|
||||
|
||||
USER postgres
|
||||
@@ -2,7 +2,6 @@ version: '3'
|
||||
|
||||
services:
|
||||
etcd:
|
||||
restart: always
|
||||
image: quay.io/coreos/etcd:v3.5.4
|
||||
ports:
|
||||
- 2379:2379
|
||||
@@ -10,7 +9,7 @@ services:
|
||||
environment:
|
||||
# This signifficantly speeds up etcd and we anyway don't data persistency there.
|
||||
ETCD_UNSAFE_NO_FSYNC: "1"
|
||||
command:
|
||||
command:
|
||||
- "etcd"
|
||||
- "--auto-compaction-mode=revision"
|
||||
- "--auto-compaction-retention=1"
|
||||
@@ -25,7 +24,6 @@ services:
|
||||
- "--quota-backend-bytes=134217728" # 128 MB
|
||||
|
||||
minio:
|
||||
restart: always
|
||||
image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
|
||||
ports:
|
||||
- 9000:9000
|
||||
@@ -43,7 +41,7 @@ services:
|
||||
entrypoint:
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
command:
|
||||
command:
|
||||
- "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do
|
||||
echo 'Waiting to start minio...' && sleep 1;
|
||||
done;
|
||||
@@ -53,8 +51,7 @@ services:
|
||||
- minio
|
||||
|
||||
pageserver:
|
||||
restart: always
|
||||
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
|
||||
image: neondatabase/neon:${TAG:-latest}
|
||||
environment:
|
||||
- BROKER_ENDPOINT='http://etcd:2379'
|
||||
- AWS_ACCESS_KEY_ID=minio
|
||||
@@ -80,8 +77,7 @@ services:
|
||||
- minio_create_buckets
|
||||
|
||||
safekeeper1:
|
||||
restart: always
|
||||
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
|
||||
image: neondatabase/neon:${TAG:-latest}
|
||||
environment:
|
||||
- SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
|
||||
- SAFEKEEPER_ID=1
|
||||
@@ -110,8 +106,7 @@ services:
|
||||
- minio_create_buckets
|
||||
|
||||
safekeeper2:
|
||||
restart: always
|
||||
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
|
||||
image: neondatabase/neon:${TAG:-latest}
|
||||
environment:
|
||||
- SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
|
||||
- SAFEKEEPER_ID=2
|
||||
@@ -140,8 +135,7 @@ services:
|
||||
- minio_create_buckets
|
||||
|
||||
safekeeper3:
|
||||
restart: always
|
||||
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
|
||||
image: neondatabase/neon:${TAG:-latest}
|
||||
environment:
|
||||
- SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
|
||||
- SAFEKEEPER_ID=3
|
||||
@@ -170,21 +164,18 @@ services:
|
||||
- minio_create_buckets
|
||||
|
||||
compute:
|
||||
restart: always
|
||||
build:
|
||||
context: ./compute_wrapper/
|
||||
context: ./image/compute
|
||||
args:
|
||||
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
|
||||
- TAG=${TAG:-latest}
|
||||
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest}
|
||||
- http_proxy=$http_proxy
|
||||
- https_proxy=$https_proxy
|
||||
environment:
|
||||
- PG_VERSION=${PG_VERSION:-14}
|
||||
#- RUST_BACKTRACE=1
|
||||
# Mount the test files directly, for faster editing cycle.
|
||||
volumes:
|
||||
- ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
|
||||
- ./compute_wrapper/shell/:/shell/
|
||||
- ./compute/var/db/postgres/specs/:/var/db/postgres/specs/
|
||||
- ./compute/shell/:/shell/
|
||||
ports:
|
||||
- 55433:55433 # pg protocol handler
|
||||
- 3080:3080 # http endpoints
|
||||
|
||||
@@ -1,60 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# A basic test to ensure Docker images are built correctly.
|
||||
# Build a wrapper around the compute, start all services and runs a simple SQL query.
|
||||
# Repeats the process for all currenly supported Postgres versions.
|
||||
|
||||
# Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file
|
||||
# Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
|
||||
# to verify custom image builds (e.g pre-published ones).
|
||||
|
||||
# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer.
|
||||
|
||||
set -eux -o pipefail
|
||||
|
||||
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
|
||||
|
||||
COMPUTE_CONTAINER_NAME=docker-compose-compute-1
|
||||
SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
|
||||
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
|
||||
|
||||
cleanup() {
|
||||
echo "show container information"
|
||||
docker ps
|
||||
docker compose -f $COMPOSE_FILE logs
|
||||
echo "stop containers..."
|
||||
docker compose -f $COMPOSE_FILE down
|
||||
}
|
||||
|
||||
echo "clean up containers if exists"
|
||||
cleanup
|
||||
|
||||
for pg_version in 14 15; do
|
||||
echo "start containers (pg_version=$pg_version)."
|
||||
PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d
|
||||
|
||||
echo "wait until the compute is ready. timeout after 60s. "
|
||||
cnt=0
|
||||
while sleep 1; do
|
||||
# check timeout
|
||||
cnt=`expr $cnt + 1`
|
||||
if [ $cnt -gt 60 ]; then
|
||||
echo "timeout before the compute is ready."
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# check if the compute is ready
|
||||
set +o pipefail
|
||||
result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
|
||||
set -o pipefail
|
||||
if [ $result -eq 1 ]; then
|
||||
echo "OK. The compute is ready to connect."
|
||||
echo "execute simple queries."
|
||||
docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
|
||||
cleanup
|
||||
break
|
||||
fi
|
||||
done
|
||||
done
|
||||
10
docker-compose/image/compute/Dockerfile
Normal file
10
docker-compose/image/compute/Dockerfile
Normal file
@@ -0,0 +1,10 @@
|
||||
ARG COMPUTE_IMAGE=compute-node-v14:latest
|
||||
FROM neondatabase/${COMPUTE_IMAGE}
|
||||
|
||||
USER root
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl \
|
||||
jq \
|
||||
netcat
|
||||
|
||||
USER postgres
|
||||
@@ -1,25 +0,0 @@
|
||||
# Neon broker
|
||||
|
||||
Neon storage broker targets two issues:
|
||||
- Allowing safekeepers and pageservers learn which nodes also hold their
|
||||
timelines, and timeline statuses there.
|
||||
- Avoiding O(n^2) connections between storage nodes while doing so.
|
||||
|
||||
This is used
|
||||
- By pageservers to determine the most advanced and alive safekeeper to pull WAL from.
|
||||
- By safekeepers to synchronize on the timeline: advance
|
||||
`remote_consistent_lsn`, `backup_lsn`, choose who offloads WAL to s3.
|
||||
|
||||
Technically, it is a simple stateless pub-sub message broker based on tonic
|
||||
(grpc) making multiplexing easy. Since it is stateless, fault tolerance can be
|
||||
provided by k8s; there is no built in replication support, though it is not hard
|
||||
to add.
|
||||
|
||||
Currently, the only message is `SafekeeperTimelineInfo`. Each safekeeper, for
|
||||
each active timeline, once in a while pushes timeline status to the broker.
|
||||
Other nodes subscribe and receive this info, using it per above.
|
||||
|
||||
grpcurl can be used to check which values are currently being pushed:
|
||||
```
|
||||
grpcurl -proto broker/proto/broker.proto -d '{"all":{}}' -plaintext localhost:50051 neon_broker.NeonBroker/SubscribeSafekeeperInfo
|
||||
```
|
||||
@@ -1,246 +0,0 @@
|
||||
# Coordinating access of multiple pageservers to the same s3 data
|
||||
|
||||
## Motivation
|
||||
|
||||
There are some blind spots around coordinating access of multiple pageservers
|
||||
to the same s3 data. Currently this is applicable only to tenant relocation
|
||||
case, but in the future we'll need to solve similar problems for
|
||||
replica/standby pageservers.
|
||||
|
||||
## Impacted components (e.g. pageserver, safekeeper, console, etc)
|
||||
|
||||
Pageserver
|
||||
|
||||
## The problem
|
||||
|
||||
### Relocation
|
||||
|
||||
During relocation both pageservers can write to s3. This should be ok for all
|
||||
data except the `index_part.json`. For index part it causes problems during
|
||||
compaction/gc because they remove files from index/s3.
|
||||
|
||||
Imagine this case:
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant PS1
|
||||
participant S3
|
||||
participant PS2
|
||||
|
||||
PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
|
||||
PS2->>S3: Attach called, sees L1, L2
|
||||
PS1->>S3: Compaction comes <br/> Removes L1, adds L3
|
||||
note over S3: Index now L2, L3
|
||||
PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
|
||||
note over S3: Index now L1, L2, L4
|
||||
```
|
||||
|
||||
At this point it is not possible to restore from index, it contains L2 which
|
||||
is no longer available in s3 and doesnt contain L3 added by compaction by the
|
||||
first pageserver. So if any of the pageservers restart initial sync will fail
|
||||
(or in on-demand world it will fail a bit later during page request from
|
||||
missing layer)
|
||||
|
||||
### Standby pageserver
|
||||
|
||||
Another related case is standby pageserver. In this case second pageserver can
|
||||
be used as a replica to scale reads and serve as a failover target in case
|
||||
first one fails.
|
||||
|
||||
In this mode second pageserver needs to have the same picture of s3 files to
|
||||
be able to load layers on-demand. To accomplish that second pageserver
|
||||
cannot run gc/compaction jobs. Instead it needs to receive updates for index
|
||||
contents. (There is no need to run walreceiver on the second pageserver then).
|
||||
|
||||
## Observations
|
||||
|
||||
- If both pageservers ingest wal then their layer set diverges, because layer
|
||||
file generation is not deterministic
|
||||
- If one of the pageservers does not ingest wal (and just picks up layer
|
||||
updates) then it lags behind and cannot really answer queries in the same
|
||||
pace as the primary one
|
||||
- Can compaction help make layers deterministic? E g we do not upload level
|
||||
zero layers and construction of higher levels should be deterministic.
|
||||
This way we can guarantee that layer creation by timeout wont mess things up.
|
||||
This way one pageserver uploads data and second one can just ingest it.
|
||||
But we still need some form of election
|
||||
|
||||
## Solutions
|
||||
|
||||
### Manual orchestration
|
||||
|
||||
One possible solution for relocation case is to orchestrate background jobs
|
||||
from outside. The oracle who runs migration can turn off background jobs on
|
||||
PS1 before migration and then run migration -> enable them on PS2. The problem
|
||||
comes if migration fails. In this case in order to resume background jobs
|
||||
oracle needs to guarantee that PS2 doesnt run background jobs and if it doesnt
|
||||
respond then PS1 is stuck unable to run compaction/gc. This cannot be solved
|
||||
without human ensuring that no upload from PS2 can happen. In order to be able
|
||||
to resolve this automatically CAS is required on S3 side so pageserver can
|
||||
avoid overwriting index part if it is no longer the leading one
|
||||
|
||||
Note that flag that disables background jobs needs to be persistent, because
|
||||
otherwise pageserver restart will clean it
|
||||
|
||||
### Avoid index_part.json
|
||||
|
||||
Index part consists of two parts, list of layers and metadata. List of layers
|
||||
can be easily obtained by `ListObjects` S3 API method. But what to do with
|
||||
metadata? Create metadata instance for each checkpoint and add some counter
|
||||
to the file name?
|
||||
|
||||
Back to potentially long s3 ls.
|
||||
|
||||
### Coordination based approach
|
||||
|
||||
Do it like safekeepers chose leader for WAL upload. Ping each other and decide
|
||||
based on some heuristics e g smallest node id. During relocation PS1 sends
|
||||
"resign" ping message so others can start election without waiting for a timeout.
|
||||
|
||||
This still leaves metadata question open and non deterministic layers are a
|
||||
problem as well
|
||||
|
||||
### Avoid metadata file
|
||||
|
||||
One way to eliminate metadata file is to store it in layer files under some
|
||||
special key. This may resonate with intention to keep all relation sizes in
|
||||
some special segment to avoid initial download during size calculation.
|
||||
Maybe with that we can even store pre calculated value.
|
||||
|
||||
As a downside each checkpoint gets 512 bytes larger.
|
||||
|
||||
If we entirely avoid metadata file this opens up many approaches
|
||||
|
||||
* * *
|
||||
|
||||
During discussion it seems that we converged on the approach consisting of:
|
||||
|
||||
- index files stored per pageserver in the same timeline directory. With that
|
||||
index file name starts to look like: `<pageserver_node_id>_index_part.json`.
|
||||
In such set up there are no concurrent overwrites of index file by different
|
||||
pageservers.
|
||||
- For replica pageservers the solution would be for primary to broadcast index
|
||||
changes to any followers with an ability to check index files in s3 and
|
||||
restore the full state. To properly merge changes with index files we can use
|
||||
a counter that is persisted in an index file, is incremented on every change
|
||||
to it and passed along with broadcasted change. This way we can determine
|
||||
whether we need to apply change to the index state or not.
|
||||
- Responsibility for running background jobs is assigned externally. Pageserver
|
||||
keeps locally persistent flag for each tenant that indicates whether this
|
||||
pageserver is considered as primary one or not. TODO what happends if we
|
||||
crash and cannot start for some extended period of time? Control plane can
|
||||
assign ownership to some other pageserver. Pageserver needs some way to check
|
||||
if its still the blessed one. Maybe by explicit request to control plane on
|
||||
start.
|
||||
|
||||
Requirement for deterministic layer generation was considered overly strict
|
||||
because of two reasons:
|
||||
|
||||
- It can limit possible optimizations e g when pageserver wants to reshuffle
|
||||
some data locally and doesnt want to coordinate this
|
||||
- The deterministic algorithm itself can change so during deployments for some
|
||||
time there will be two different version running at the same time which can
|
||||
cause non determinism
|
||||
|
||||
### External elections
|
||||
|
||||
The above case with lost state in this schema with externally managed
|
||||
leadership is represented like this:
|
||||
|
||||
Note that here we keep objects list in the index file.
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant PS1
|
||||
participant CP as Control Plane
|
||||
participant S3
|
||||
participant PS2
|
||||
|
||||
note over PS1,PS2: PS1 starts up and still a leader
|
||||
PS1->>CP: Am I still the leader for Tenant X?
|
||||
activate CP
|
||||
CP->>PS1: Yes
|
||||
deactivate CP
|
||||
PS1->>S3: Fetch PS1 index.
|
||||
note over PS1: Continue operations, start backround jobs
|
||||
note over PS1,PS2: PS1 starts up and still and is not a leader anymore
|
||||
PS1->>CP: Am I still the leader for Tenant X?
|
||||
CP->>PS1: No
|
||||
PS1->>PS2: Subscribe to index changes
|
||||
PS1->>S3: Fetch PS1 and PS2 indexes
|
||||
note over PS1: Combine index file to include layers <br> from both indexes to be able <br> to see newer files from leader (PS2)
|
||||
note over PS1: Continue operations, do not start background jobs
|
||||
```
|
||||
|
||||
### Internal elections
|
||||
|
||||
To manage leadership internally we can use broker to exchange pings so nodes
|
||||
can decide on the leader roles. In case multiple pageservers are active leader
|
||||
is the one with lowest node id.
|
||||
|
||||
Operations with internally managed elections:
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant PS1
|
||||
participant S3
|
||||
|
||||
note over PS1: Starts up
|
||||
note over PS1: Subscribes to changes, waits for two ping <br> timeouts to see if there is a leader
|
||||
PS1->>S3: Fetch indexes from s3
|
||||
alt there is a leader
|
||||
note over PS1: do not start background jobs, <br> continue applying index updates
|
||||
else there is no leader
|
||||
note over PS1: start background jobs, <br> broadcast index changes
|
||||
end
|
||||
|
||||
note over PS1,S3: Then the picture is similar to external elections <br> the difference is that follower can become a leader <br> if there are no pings after some timeout new leader gets elected
|
||||
```
|
||||
|
||||
### Eviction
|
||||
|
||||
When two pageservers operate on a tenant for extended period of time follower
|
||||
doesnt perform write operations in s3. When layer is evicted follower relies
|
||||
on updates from primary to get info about layers it needs to cover range for
|
||||
evicted layer.
|
||||
|
||||
Note that it wont match evicted layer exactly, so layers will overlap and
|
||||
lookup code needs to correctly handle that.
|
||||
|
||||
### Relocation flow
|
||||
|
||||
Actions become:
|
||||
|
||||
- Attach tenant to new pageserver
|
||||
- New pageserver becomes follower since previous one is still leading
|
||||
- New pageserver starts replicating from safekeepers but does not upload layers
|
||||
- Detach is called on the old one
|
||||
- New pageserver becomes leader after it realizes that old one disappeared
|
||||
|
||||
### Index File
|
||||
|
||||
Using `s3 ls` on startup simplifies things, but we still need metadata, so we
|
||||
need to fetch index files anyway. If they contain list of files we can combine
|
||||
them and avoid costly `s3 ls`
|
||||
|
||||
### Remaining issues
|
||||
|
||||
- More than one remote consistent lsn for safekeepers to know
|
||||
|
||||
Anything else?
|
||||
|
||||
### Proposed solution
|
||||
|
||||
To recap. On meeting we converged on approach with external elections but I
|
||||
think it will be overall harder to manage and will introduce a dependency on
|
||||
control plane for pageserver. Using separate index files for each pageserver
|
||||
consisting of log of operations and a metadata snapshot should be enough.
|
||||
|
||||
### What we need to get there?
|
||||
|
||||
- Change index file structure to contain log of changes instead of just the
|
||||
file list
|
||||
- Implement pinging/elections for pageservers
|
||||
@@ -2,11 +2,6 @@
|
||||
|
||||
Below you will find a brief overview of each subdir in the source tree in alphabetical order.
|
||||
|
||||
`broker`:
|
||||
|
||||
Neon storage broker, providing messaging between safekeepers and pageservers.
|
||||
[broker.md](./broker.md)
|
||||
|
||||
`/control_plane`:
|
||||
|
||||
Local control plane.
|
||||
@@ -57,10 +52,6 @@ PostgreSQL extension that implements storage manager API and network communicati
|
||||
|
||||
PostgreSQL extension that contains functions needed for testing and debugging.
|
||||
|
||||
`/pgxn/neon_walredo`:
|
||||
|
||||
Library to run Postgres as a "WAL redo process" in the pageserver.
|
||||
|
||||
`/safekeeper`:
|
||||
|
||||
The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
|
||||
|
||||
@@ -9,7 +9,6 @@ serde_with = "2.0"
|
||||
const_format = "0.2.21"
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
|
||||
utils = { path = "../utils" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use std::num::NonZeroU64;
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::{
|
||||
@@ -10,7 +9,7 @@ use utils::{
|
||||
|
||||
use crate::reltag::RelTag;
|
||||
use anyhow::bail;
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
|
||||
/// A state of a tenant in pageserver's memory.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -73,7 +72,6 @@ pub struct TenantCreateRequest {
|
||||
pub walreceiver_connect_timeout: Option<String>,
|
||||
pub lagging_wal_timeout: Option<String>,
|
||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||
pub trace_read_requests: Option<bool>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -113,7 +111,6 @@ pub struct TenantConfigRequest {
|
||||
pub walreceiver_connect_timeout: Option<String>,
|
||||
pub lagging_wal_timeout: Option<String>,
|
||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||
pub trace_read_requests: Option<bool>,
|
||||
}
|
||||
|
||||
impl TenantConfigRequest {
|
||||
@@ -132,7 +129,6 @@ impl TenantConfigRequest {
|
||||
walreceiver_connect_timeout: None,
|
||||
lagging_wal_timeout: None,
|
||||
max_lsn_wal_lag: None,
|
||||
trace_read_requests: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -229,7 +225,6 @@ pub struct TimelineGcRequest {
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
#[derive(PartialEq, Eq)]
|
||||
pub enum PagestreamFeMessage {
|
||||
Exists(PagestreamExistsRequest),
|
||||
Nblocks(PagestreamNblocksRequest),
|
||||
@@ -246,21 +241,21 @@ pub enum PagestreamBeMessage {
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamExistsRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamNblocksRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamGetPageRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
@@ -268,7 +263,7 @@ pub struct PagestreamGetPageRequest {
|
||||
pub blkno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamDbSizeRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
@@ -301,98 +296,52 @@ pub struct PagestreamDbSizeResponse {
|
||||
}
|
||||
|
||||
impl PagestreamFeMessage {
|
||||
pub fn serialize(&self) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
match self {
|
||||
Self::Exists(req) => {
|
||||
bytes.put_u8(0);
|
||||
bytes.put_u8(if req.latest { 1 } else { 0 });
|
||||
bytes.put_u64(req.lsn.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
bytes.put_u8(req.rel.forknum);
|
||||
}
|
||||
|
||||
Self::Nblocks(req) => {
|
||||
bytes.put_u8(1);
|
||||
bytes.put_u8(if req.latest { 1 } else { 0 });
|
||||
bytes.put_u64(req.lsn.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
bytes.put_u8(req.rel.forknum);
|
||||
}
|
||||
|
||||
Self::GetPage(req) => {
|
||||
bytes.put_u8(2);
|
||||
bytes.put_u8(if req.latest { 1 } else { 0 });
|
||||
bytes.put_u64(req.lsn.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
bytes.put_u8(req.rel.forknum);
|
||||
bytes.put_u32(req.blkno);
|
||||
}
|
||||
|
||||
Self::DbSize(req) => {
|
||||
bytes.put_u8(3);
|
||||
bytes.put_u8(if req.latest { 1 } else { 0 });
|
||||
bytes.put_u64(req.lsn.0);
|
||||
bytes.put_u32(req.dbnode);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
|
||||
pub fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
|
||||
// TODO these gets can fail
|
||||
|
||||
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
||||
//
|
||||
// TODO: consider using protobuf or serde bincode for less error prone
|
||||
// serialization.
|
||||
let msg_tag = body.read_u8()?;
|
||||
let msg_tag = body.get_u8();
|
||||
match msg_tag {
|
||||
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
latest: body.read_u8()? != 0,
|
||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
},
|
||||
})),
|
||||
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
latest: body.read_u8()? != 0,
|
||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
},
|
||||
})),
|
||||
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
latest: body.read_u8()? != 0,
|
||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
},
|
||||
blkno: body.read_u32::<BigEndian>()?,
|
||||
blkno: body.get_u32(),
|
||||
})),
|
||||
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
latest: body.read_u8()? != 0,
|
||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
dbnode: body.get_u32(),
|
||||
})),
|
||||
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
|
||||
_ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -431,58 +380,3 @@ impl PagestreamBeMessage {
|
||||
bytes.into()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use bytes::Buf;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_pagestream() {
|
||||
// Test serialization/deserialization of PagestreamFeMessage
|
||||
let messages = vec![
|
||||
PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
latest: true,
|
||||
lsn: Lsn(4),
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
dbnode: 3,
|
||||
relnode: 4,
|
||||
},
|
||||
}),
|
||||
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
latest: false,
|
||||
lsn: Lsn(4),
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
dbnode: 3,
|
||||
relnode: 4,
|
||||
},
|
||||
}),
|
||||
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
latest: true,
|
||||
lsn: Lsn(4),
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
dbnode: 3,
|
||||
relnode: 4,
|
||||
},
|
||||
blkno: 7,
|
||||
}),
|
||||
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
latest: true,
|
||||
lsn: Lsn(4),
|
||||
dbnode: 7,
|
||||
}),
|
||||
];
|
||||
for msg in messages {
|
||||
let bytes = msg.serialize();
|
||||
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
|
||||
assert!(msg == reconstructed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
[package]
|
||||
name = "pq_proto"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
bytes = "1.0.1"
|
||||
pin-project-lite = "0.2.7"
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
rand = "0.8.3"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
tracing = "0.1"
|
||||
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
3
libs/tenant_size_model/.gitignore
vendored
3
libs/tenant_size_model/.gitignore
vendored
@@ -1,3 +0,0 @@
|
||||
*.dot
|
||||
*.png
|
||||
*.svg
|
||||
@@ -1,8 +0,0 @@
|
||||
[package]
|
||||
name = "tenant_size_model"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
publish = false
|
||||
|
||||
[dependencies]
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
@@ -1,13 +0,0 @@
|
||||
all: 1.svg 2.svg 3.svg 4.svg 1.png 2.png 3.png 4.png
|
||||
|
||||
../../target/debug/tenant_size_model: Cargo.toml src/main.rs src/lib.rs
|
||||
cargo build --bin tenant_size_model
|
||||
|
||||
%.svg: %.dot
|
||||
dot -Tsvg $< > $@
|
||||
|
||||
%.png: %.dot
|
||||
dot -Tpng $< > $@
|
||||
|
||||
%.dot: ../../target/debug/tenant_size_model
|
||||
../../target/debug/tenant_size_model $* > $@
|
||||
@@ -1,7 +0,0 @@
|
||||
# Logical size + WAL pricing
|
||||
|
||||
This is a simulator to calculate the tenant size in different scenarios,
|
||||
using the "Logical size + WAL" method. Makefile produces diagrams used in a
|
||||
private presentation:
|
||||
|
||||
https://docs.google.com/presentation/d/1OapE4k11xmcwMh7I7YvNWGC63yCRLh6udO9bXZ-fZmo/edit?usp=sharing
|
||||
@@ -1,349 +0,0 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Pricing model or history size builder.
|
||||
///
|
||||
/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
|
||||
/// type.
|
||||
pub struct Storage<K: 'static> {
|
||||
segments: Vec<Segment>,
|
||||
|
||||
/// Mapping from the branch name to the index of a segment describing it's latest state.
|
||||
branches: HashMap<K, usize>,
|
||||
}
|
||||
|
||||
/// Snapshot of a branch.
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
pub struct Segment {
|
||||
/// Previous segment index into ['Storage::segments`], if any.
|
||||
parent: Option<usize>,
|
||||
|
||||
/// Description of how did we get to this state.
|
||||
///
|
||||
/// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when
|
||||
/// modifying a branch directly.
|
||||
pub op: Cow<'static, str>,
|
||||
|
||||
/// LSN before this state
|
||||
start_lsn: u64,
|
||||
|
||||
/// LSN at this state
|
||||
pub end_lsn: u64,
|
||||
|
||||
/// Logical size before this state
|
||||
start_size: u64,
|
||||
|
||||
/// Logical size at this state
|
||||
pub end_size: u64,
|
||||
|
||||
/// Indices to [`Storage::segments`]
|
||||
///
|
||||
/// FIXME: this could be an Option<usize>
|
||||
children_after: Vec<usize>,
|
||||
|
||||
/// Determined by `retention_period` given to [`Storage::calculate`]
|
||||
pub needed: bool,
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
// *-g--*---D--->
|
||||
// /
|
||||
// /
|
||||
// / *---b----*-B--->
|
||||
// / /
|
||||
// / /
|
||||
// -----*--e---*-----f----* C
|
||||
// E \
|
||||
// \
|
||||
// *--a---*---A-->
|
||||
//
|
||||
// If A and B need to be retained, is it cheaper to store
|
||||
// snapshot at C+a+b, or snapshots at A and B ?
|
||||
//
|
||||
// If D also needs to be retained, which is cheaper:
|
||||
//
|
||||
// 1. E+g+e+f+a+b
|
||||
// 2. D+C+a+b
|
||||
// 3. D+A+B
|
||||
|
||||
/// [`Segment`] which has had it's size calculated.
|
||||
pub struct SegmentSize {
|
||||
pub seg_id: usize,
|
||||
|
||||
pub method: SegmentMethod,
|
||||
|
||||
this_size: u64,
|
||||
|
||||
pub children: Vec<SegmentSize>,
|
||||
}
|
||||
|
||||
impl SegmentSize {
|
||||
fn total(&self) -> u64 {
|
||||
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
}
|
||||
|
||||
pub fn total_children(&self) -> u64 {
|
||||
if self.method == SnapshotAfter {
|
||||
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
} else {
|
||||
self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Different methods to retain history from a particular state
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
pub enum SegmentMethod {
|
||||
SnapshotAfter,
|
||||
Wal,
|
||||
WalNeeded,
|
||||
Skipped,
|
||||
}
|
||||
|
||||
use SegmentMethod::*;
|
||||
|
||||
impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
/// Creates a new storage with the given default branch name.
|
||||
pub fn new(initial_branch: K) -> Storage<K> {
|
||||
let init_segment = Segment {
|
||||
op: "".into(),
|
||||
needed: false,
|
||||
parent: None,
|
||||
start_lsn: 0,
|
||||
end_lsn: 0,
|
||||
start_size: 0,
|
||||
end_size: 0,
|
||||
children_after: Vec::new(),
|
||||
};
|
||||
|
||||
Storage {
|
||||
segments: vec![init_segment],
|
||||
branches: HashMap::from([(initial_branch, 0)]),
|
||||
}
|
||||
}
|
||||
|
||||
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
|
||||
pub fn modify_branch<Q: ?Sized>(
|
||||
&mut self,
|
||||
branch: &Q,
|
||||
op: Cow<'static, str>,
|
||||
lsn_bytes: u64,
|
||||
size_bytes: i64,
|
||||
) where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
{
|
||||
let lastseg_id = *self.branches.get(branch).unwrap();
|
||||
let newseg_id = self.segments.len();
|
||||
let lastseg = &mut self.segments[lastseg_id];
|
||||
|
||||
let newseg = Segment {
|
||||
op,
|
||||
parent: Some(lastseg_id),
|
||||
start_lsn: lastseg.end_lsn,
|
||||
end_lsn: lastseg.end_lsn + lsn_bytes,
|
||||
start_size: lastseg.end_size,
|
||||
end_size: (lastseg.end_size as i64 + size_bytes) as u64,
|
||||
children_after: Vec::new(),
|
||||
needed: false,
|
||||
};
|
||||
lastseg.children_after.push(newseg_id);
|
||||
|
||||
self.segments.push(newseg);
|
||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||
}
|
||||
|
||||
pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
{
|
||||
self.modify_branch(branch, "insert".into(), bytes, bytes as i64);
|
||||
}
|
||||
|
||||
pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
{
|
||||
self.modify_branch(branch, "update".into(), bytes, 0i64);
|
||||
}
|
||||
|
||||
pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
{
|
||||
self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64));
|
||||
}
|
||||
|
||||
/// Panics if the parent branch cannot be found.
|
||||
pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K)
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
{
|
||||
// Find the right segment
|
||||
let branchseg_id = *self
|
||||
.branches
|
||||
.get(parent)
|
||||
.expect("should had found the parent by key");
|
||||
let _branchseg = &mut self.segments[branchseg_id];
|
||||
|
||||
// Create branch name for it
|
||||
self.branches.insert(name, branchseg_id);
|
||||
}
|
||||
|
||||
pub fn calculate(&mut self, retention_period: u64) -> SegmentSize {
|
||||
// Phase 1: Mark all the segments that need to be retained
|
||||
for (_branch, &last_seg_id) in self.branches.iter() {
|
||||
let last_seg = &self.segments[last_seg_id];
|
||||
let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period);
|
||||
let mut seg_id = last_seg_id;
|
||||
loop {
|
||||
let seg = &mut self.segments[seg_id];
|
||||
if seg.end_lsn < cutoff_lsn {
|
||||
break;
|
||||
}
|
||||
seg.needed = true;
|
||||
if let Some(prev_seg_id) = seg.parent {
|
||||
seg_id = prev_seg_id;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: For each oldest segment in a chain that needs to be retained,
|
||||
// calculate if we should store snapshot or WAL
|
||||
self.size_from_snapshot_later(0)
|
||||
}
|
||||
|
||||
fn size_from_wal(&self, seg_id: usize) -> SegmentSize {
|
||||
let seg = &self.segments[seg_id];
|
||||
|
||||
let this_size = seg.end_lsn - seg.start_lsn;
|
||||
|
||||
let mut children = Vec::new();
|
||||
|
||||
// try both ways
|
||||
for &child_id in seg.children_after.iter() {
|
||||
// try each child both ways
|
||||
let child = &self.segments[child_id];
|
||||
let p1 = self.size_from_wal(child_id);
|
||||
|
||||
let p = if !child.needed {
|
||||
let p2 = self.size_from_snapshot_later(child_id);
|
||||
if p1.total() < p2.total() {
|
||||
p1
|
||||
} else {
|
||||
p2
|
||||
}
|
||||
} else {
|
||||
p1
|
||||
};
|
||||
children.push(p);
|
||||
}
|
||||
SegmentSize {
|
||||
seg_id,
|
||||
method: if seg.needed { WalNeeded } else { Wal },
|
||||
this_size,
|
||||
children,
|
||||
}
|
||||
}
|
||||
|
||||
fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize {
|
||||
// If this is needed, then it's time to do the snapshot and continue
|
||||
// with wal method.
|
||||
let seg = &self.segments[seg_id];
|
||||
//eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed);
|
||||
if seg.needed {
|
||||
let mut children = Vec::new();
|
||||
|
||||
for &child_id in seg.children_after.iter() {
|
||||
// try each child both ways
|
||||
let child = &self.segments[child_id];
|
||||
let p1 = self.size_from_wal(child_id);
|
||||
|
||||
let p = if !child.needed {
|
||||
let p2 = self.size_from_snapshot_later(child_id);
|
||||
if p1.total() < p2.total() {
|
||||
p1
|
||||
} else {
|
||||
p2
|
||||
}
|
||||
} else {
|
||||
p1
|
||||
};
|
||||
children.push(p);
|
||||
}
|
||||
SegmentSize {
|
||||
seg_id,
|
||||
method: WalNeeded,
|
||||
this_size: seg.start_size,
|
||||
children,
|
||||
}
|
||||
} else {
|
||||
// If any of the direct children are "needed", need to be able to reconstruct here
|
||||
let mut children_needed = false;
|
||||
for &child in seg.children_after.iter() {
|
||||
let seg = &self.segments[child];
|
||||
if seg.needed {
|
||||
children_needed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let method1 = if !children_needed {
|
||||
let mut children = Vec::new();
|
||||
for child in seg.children_after.iter() {
|
||||
children.push(self.size_from_snapshot_later(*child));
|
||||
}
|
||||
Some(SegmentSize {
|
||||
seg_id,
|
||||
method: Skipped,
|
||||
this_size: 0,
|
||||
children,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// If this a junction, consider snapshotting here
|
||||
let method2 = if children_needed || seg.children_after.len() >= 2 {
|
||||
let mut children = Vec::new();
|
||||
for child in seg.children_after.iter() {
|
||||
children.push(self.size_from_wal(*child));
|
||||
}
|
||||
Some(SegmentSize {
|
||||
seg_id,
|
||||
method: SnapshotAfter,
|
||||
this_size: seg.end_size,
|
||||
children,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
match (method1, method2) {
|
||||
(None, None) => panic!(),
|
||||
(Some(method), None) => method,
|
||||
(None, Some(method)) => method,
|
||||
(Some(method1), Some(method2)) => {
|
||||
if method1.total() < method2.total() {
|
||||
method1
|
||||
} else {
|
||||
method2
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_segments(self) -> Vec<Segment> {
|
||||
self.segments
|
||||
}
|
||||
}
|
||||
@@ -1,268 +0,0 @@
|
||||
//! Tenant size model testing ground.
|
||||
//!
|
||||
//! Has a number of scenarios and a `main` for invoking these by number, calculating the history
|
||||
//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios
|
||||
//! into pngs.
|
||||
|
||||
use tenant_size_model::{Segment, SegmentSize, Storage};
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
fn scenario_1() -> (Vec<Segment>, SegmentSize) {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000);
|
||||
|
||||
(storage.into_segments(), size)
|
||||
}
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
fn scenario_2() -> (Vec<Segment>, SegmentSize) {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child");
|
||||
storage.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
storage.update("main", 1_000);
|
||||
|
||||
let size = storage.calculate(1000);
|
||||
|
||||
(storage.into_segments(), size)
|
||||
}
|
||||
|
||||
// Like 2, but more updates on main
|
||||
fn scenario_3() -> (Vec<Segment>, SegmentSize) {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child");
|
||||
storage.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000);
|
||||
|
||||
(storage.into_segments(), size)
|
||||
}
|
||||
|
||||
// Diverged branches
|
||||
fn scenario_4() -> (Vec<Segment>, SegmentSize) {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child");
|
||||
storage.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..8 {
|
||||
storage.update("main", 1_000);
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000);
|
||||
|
||||
(storage.into_segments(), size)
|
||||
}
|
||||
|
||||
fn scenario_5() -> (Vec<Segment>, SegmentSize) {
|
||||
let mut storage = Storage::new("a");
|
||||
storage.insert("a", 5000);
|
||||
storage.branch("a", "b");
|
||||
storage.update("b", 4000);
|
||||
storage.update("a", 2000);
|
||||
storage.branch("a", "c");
|
||||
storage.insert("c", 4000);
|
||||
storage.insert("a", 2000);
|
||||
|
||||
let size = storage.calculate(5000);
|
||||
|
||||
(storage.into_segments(), size)
|
||||
}
|
||||
|
||||
fn scenario_6() -> (Vec<Segment>, SegmentSize) {
|
||||
use std::borrow::Cow;
|
||||
|
||||
const NO_OP: Cow<'static, str> = Cow::Borrowed("");
|
||||
|
||||
let branches = [
|
||||
Some(0x7ff1edab8182025f15ae33482edb590a_u128),
|
||||
Some(0xb1719e044db05401a05a2ed588a3ad3f),
|
||||
Some(0xb68d6691c895ad0a70809470020929ef),
|
||||
];
|
||||
|
||||
// compared to other scenarios, this one uses bytes instead of kB
|
||||
|
||||
let mut storage = Storage::new(None);
|
||||
|
||||
storage.branch(&None, branches[0]); // at 0
|
||||
storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064
|
||||
storage.branch(&branches[0], branches[1]); // at 108951064
|
||||
storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472
|
||||
storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424
|
||||
storage.branch(&branches[0], branches[2]); // at 283415424
|
||||
storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616
|
||||
storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400
|
||||
|
||||
let size = storage.calculate(100_000);
|
||||
|
||||
(storage.into_segments(), size)
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let args: Vec<String> = std::env::args().collect();
|
||||
|
||||
let scenario = if args.len() < 2 { "1" } else { &args[1] };
|
||||
|
||||
let (segments, size) = match scenario {
|
||||
"1" => scenario_1(),
|
||||
"2" => scenario_2(),
|
||||
"3" => scenario_3(),
|
||||
"4" => scenario_4(),
|
||||
"5" => scenario_5(),
|
||||
"6" => scenario_6(),
|
||||
other => {
|
||||
eprintln!("invalid scenario {}", other);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
graphviz_tree(&segments, &size);
|
||||
}
|
||||
|
||||
fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
|
||||
use tenant_size_model::SegmentMethod::*;
|
||||
|
||||
let seg_id = node.seg_id;
|
||||
let seg = segments.get(seg_id).unwrap();
|
||||
let lsn = seg.end_lsn;
|
||||
let size = seg.end_size;
|
||||
let method = node.method;
|
||||
|
||||
println!(" {{");
|
||||
println!(" node [width=0.1 height=0.1 shape=oval]");
|
||||
|
||||
let tenant_size = node.total_children();
|
||||
|
||||
let penwidth = if seg.needed { 6 } else { 3 };
|
||||
let x = match method {
|
||||
SnapshotAfter =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"),
|
||||
Wal =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
|
||||
WalNeeded =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
|
||||
Skipped =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"),
|
||||
};
|
||||
|
||||
println!(" \"seg{seg_id}\" [{x}]");
|
||||
println!(" }}");
|
||||
|
||||
// Recurse. Much of the data is actually on the edge
|
||||
for child in node.children.iter() {
|
||||
let child_id = child.seg_id;
|
||||
graphviz_recurse(segments, child);
|
||||
|
||||
let edge_color = match child.method {
|
||||
SnapshotAfter => "gray",
|
||||
Wal => "black",
|
||||
WalNeeded => "black",
|
||||
Skipped => "gray",
|
||||
};
|
||||
|
||||
println!(" {{");
|
||||
println!(" edge [] ");
|
||||
print!(" \"seg{seg_id}\" -> \"seg{child_id}\" [");
|
||||
print!("color={edge_color}");
|
||||
if child.method == WalNeeded {
|
||||
print!(" penwidth=6");
|
||||
}
|
||||
if child.method == Wal {
|
||||
print!(" penwidth=3");
|
||||
}
|
||||
|
||||
let next = segments.get(child_id).unwrap();
|
||||
|
||||
if next.op.is_empty() {
|
||||
print!(
|
||||
" label=\"{} / {}\"",
|
||||
next.end_lsn - seg.end_lsn,
|
||||
(next.end_size as i128 - seg.end_size as i128)
|
||||
);
|
||||
} else {
|
||||
print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);
|
||||
}
|
||||
println!("]");
|
||||
println!(" }}");
|
||||
}
|
||||
}
|
||||
|
||||
fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
|
||||
println!("digraph G {{");
|
||||
println!(" fontname=\"Helvetica,Arial,sans-serif\"");
|
||||
println!(" node [fontname=\"Helvetica,Arial,sans-serif\"]");
|
||||
println!(" edge [fontname=\"Helvetica,Arial,sans-serif\"]");
|
||||
println!(" graph [center=1 rankdir=LR]");
|
||||
println!(" edge [dir=none]");
|
||||
|
||||
graphviz_recurse(segments, tree);
|
||||
|
||||
println!("}}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scenarios_return_same_size() {
|
||||
type ScenarioFn = fn() -> (Vec<Segment>, SegmentSize);
|
||||
let truths: &[(u32, ScenarioFn, _)] = &[
|
||||
(line!(), scenario_1, 8000),
|
||||
(line!(), scenario_2, 9000),
|
||||
(line!(), scenario_3, 13000),
|
||||
(line!(), scenario_4, 16000),
|
||||
(line!(), scenario_5, 17000),
|
||||
(line!(), scenario_6, 333_792_000),
|
||||
];
|
||||
|
||||
for (line, scenario, expected) in truths {
|
||||
let (_, size) = scenario();
|
||||
assert_eq!(*expected, size.total_children(), "scenario on line {line}");
|
||||
}
|
||||
}
|
||||
@@ -9,6 +9,9 @@ anyhow = "1.0"
|
||||
bincode = "1.3"
|
||||
bytes = "1.0.1"
|
||||
hyper = { version = "0.14.7", features = ["full"] }
|
||||
pin-project-lite = "0.2.7"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
routerify = "3"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
@@ -30,8 +33,8 @@ once_cell = "1.13.0"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
|
||||
|
||||
metrics = { path = "../metrics" }
|
||||
pq_proto = { path = "../pq_proto" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
52
libs/utils/src/connstring.rs
Normal file
52
libs/utils/src/connstring.rs
Normal file
@@ -0,0 +1,52 @@
|
||||
use postgres::Config;
|
||||
|
||||
pub fn connection_host_port(config: &Config) -> (String, u16) {
|
||||
assert_eq!(
|
||||
config.get_hosts().len(),
|
||||
1,
|
||||
"only one pair of host and port is supported in connection string"
|
||||
);
|
||||
assert_eq!(
|
||||
config.get_ports().len(),
|
||||
1,
|
||||
"only one pair of host and port is supported in connection string"
|
||||
);
|
||||
let host = match &config.get_hosts()[0] {
|
||||
postgres::config::Host::Tcp(host) => host.as_ref(),
|
||||
postgres::config::Host::Unix(host) => host.to_str().unwrap(),
|
||||
};
|
||||
(host.to_owned(), config.get_ports()[0])
|
||||
}
|
||||
|
||||
pub fn connection_address(config: &Config) -> String {
|
||||
let (host, port) = connection_host_port(config);
|
||||
format!("{}:{}", host, port)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_connection_host_port() {
|
||||
let config: Config = "postgresql://no_user@localhost:64000/no_db"
|
||||
.parse()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
connection_host_port(&config),
|
||||
("localhost".to_owned(), 64000)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "only one pair of host and port is supported in connection string")]
|
||||
fn test_connection_host_port_multiple_ports() {
|
||||
let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db"
|
||||
.parse()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
connection_host_port(&config),
|
||||
("localhost".to_owned(), 64000)
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -3,13 +3,6 @@ use std::{fmt, str::FromStr};
|
||||
use hex::FromHex;
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum IdError {
|
||||
#[error("invalid id length {0}")]
|
||||
SliceParseError(usize),
|
||||
}
|
||||
|
||||
/// Neon ID is a 128-bit random ID.
|
||||
/// Used to represent various identifiers. Provides handy utility methods and impls.
|
||||
@@ -29,15 +22,6 @@ impl Id {
|
||||
Id::from(arr)
|
||||
}
|
||||
|
||||
pub fn from_slice(src: &[u8]) -> Result<Id, IdError> {
|
||||
if src.len() != 16 {
|
||||
return Err(IdError::SliceParseError(src.len()));
|
||||
}
|
||||
let mut id_array = [0u8; 16];
|
||||
id_array.copy_from_slice(src);
|
||||
Ok(id_array.into())
|
||||
}
|
||||
|
||||
pub fn as_arr(&self) -> [u8; 16] {
|
||||
self.0
|
||||
}
|
||||
@@ -116,10 +100,6 @@ macro_rules! id_newtype {
|
||||
$t(Id::get_from_buf(buf))
|
||||
}
|
||||
|
||||
pub fn from_slice(src: &[u8]) -> Result<$t, IdError> {
|
||||
Ok($t(Id::from_slice(src)?))
|
||||
}
|
||||
|
||||
pub fn as_arr(&self) -> [u8; 16] {
|
||||
self.0.as_arr()
|
||||
}
|
||||
@@ -224,17 +204,6 @@ pub struct TenantId(Id);
|
||||
|
||||
id_newtype!(TenantId);
|
||||
|
||||
/// Neon Connection Id identifies long-lived connections (for example a pagestream
|
||||
/// connection with the page_service). Is used for better logging and tracing
|
||||
///
|
||||
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
|
||||
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
|
||||
/// See [`Id`] for alternative ways to serialize it.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
|
||||
pub struct ConnectionId(Id);
|
||||
|
||||
id_newtype!(ConnectionId);
|
||||
|
||||
// A pair uniquely identifying Neon instance.
|
||||
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct TenantTimelineId {
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
//! `utils` is intended to be a place to put code that is shared
|
||||
//! between other crates in this repository.
|
||||
|
||||
#![allow(clippy::manual_range_contains)]
|
||||
|
||||
/// `Lsn` type implements common tasks on Log Sequence Numbers
|
||||
pub mod lsn;
|
||||
/// SeqWait allows waiting for a future sequence number to arrive
|
||||
@@ -15,6 +17,10 @@ pub mod vec_map;
|
||||
pub mod bin_ser;
|
||||
pub mod postgres_backend;
|
||||
pub mod postgres_backend_async;
|
||||
pub mod pq_proto;
|
||||
|
||||
// dealing with connstring parsing and handy access to it's parts
|
||||
pub mod connstring;
|
||||
|
||||
// helper functions for creating and fsyncing
|
||||
pub mod crashsafe;
|
||||
@@ -33,12 +39,13 @@ pub mod sock_split;
|
||||
// common log initialisation routine
|
||||
pub mod logging;
|
||||
|
||||
pub mod lock_file;
|
||||
|
||||
// Misc
|
||||
pub mod accum;
|
||||
pub mod shutdown;
|
||||
|
||||
// Tools for calling certain async methods in sync contexts
|
||||
pub mod sync;
|
||||
|
||||
// Utility for binding TcpListeners with proper socket options.
|
||||
pub mod tcp_listener;
|
||||
|
||||
|
||||
@@ -1,81 +0,0 @@
|
||||
//! A module to create and read lock files. A lock file ensures that only one
|
||||
//! process is running at a time, in a particular directory.
|
||||
//!
|
||||
//! File locking is done using [`fcntl::flock`], which means that holding the
|
||||
//! lock on file only prevents acquiring another lock on it; all other
|
||||
//! operations are still possible on files. Other process can still open, read,
|
||||
//! write, or remove the file, for example.
|
||||
//! If the file is removed while a process is holding a lock on it,
|
||||
//! the process that holds the lock does not get any error or notification.
|
||||
//! Furthermore, you can create a new file with the same name and lock the new file,
|
||||
//! while the old process is still running.
|
||||
//! Deleting the lock file while the locking process is still running is a bad idea!
|
||||
|
||||
use std::{fs, os::unix::prelude::AsRawFd, path::Path};
|
||||
|
||||
use anyhow::Context;
|
||||
use nix::fcntl;
|
||||
|
||||
use crate::crashsafe;
|
||||
|
||||
pub enum LockCreationResult {
|
||||
Created {
|
||||
new_lock_contents: String,
|
||||
file: fs::File,
|
||||
},
|
||||
AlreadyLocked {
|
||||
existing_lock_contents: String,
|
||||
},
|
||||
CreationFailed(anyhow::Error),
|
||||
}
|
||||
|
||||
/// Creates a lock file in the path given and writes the given contents into the file.
|
||||
/// Note: The lock is automatically released when the file closed. You might want to use Box::leak to make sure it lives until the end of the program.
|
||||
pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
|
||||
let lock_file = match fs::OpenOptions::new()
|
||||
.create(true) // O_CREAT
|
||||
.write(true)
|
||||
.open(lock_file_path)
|
||||
.context("Failed to open lock file")
|
||||
{
|
||||
Ok(file) => file,
|
||||
Err(e) => return LockCreationResult::CreationFailed(e),
|
||||
};
|
||||
|
||||
match fcntl::flock(
|
||||
lock_file.as_raw_fd(),
|
||||
fcntl::FlockArg::LockExclusiveNonblock,
|
||||
) {
|
||||
Ok(()) => {
|
||||
match lock_file
|
||||
.set_len(0)
|
||||
.context("Failed to truncate lockfile")
|
||||
.and_then(|()| {
|
||||
fs::write(lock_file_path, &contents).with_context(|| {
|
||||
format!("Failed to write '{contents}' contents into lockfile")
|
||||
})
|
||||
})
|
||||
.and_then(|()| {
|
||||
crashsafe::fsync_file_and_parent(lock_file_path)
|
||||
.context("Failed to fsync lockfile")
|
||||
}) {
|
||||
Ok(()) => LockCreationResult::Created {
|
||||
new_lock_contents: contents,
|
||||
file: lock_file,
|
||||
},
|
||||
Err(e) => LockCreationResult::CreationFailed(e),
|
||||
}
|
||||
}
|
||||
Err(nix::errno::Errno::EAGAIN) => {
|
||||
match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
|
||||
Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
|
||||
existing_lock_contents,
|
||||
},
|
||||
Err(e) => LockCreationResult::CreationFailed(e),
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,10 @@
|
||||
use std::str::FromStr;
|
||||
use std::{
|
||||
fs::{File, OpenOptions},
|
||||
path::Path,
|
||||
str::FromStr,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::{Context, Result};
|
||||
use strum_macros::{EnumString, EnumVariantNames};
|
||||
|
||||
#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
|
||||
@@ -21,8 +25,19 @@ impl LogFormat {
|
||||
})
|
||||
}
|
||||
}
|
||||
pub fn init(
|
||||
log_filename: impl AsRef<Path>,
|
||||
daemonize: bool,
|
||||
log_format: LogFormat,
|
||||
) -> Result<File> {
|
||||
// Don't open the same file for output multiple times;
|
||||
// the different fds could overwrite each other's output.
|
||||
let log_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?;
|
||||
|
||||
pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
|
||||
let default_filter_str = "info";
|
||||
|
||||
// We fall back to printing all spans at info-level or above if
|
||||
@@ -30,16 +45,50 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
|
||||
let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str));
|
||||
|
||||
let x: File = log_file.try_clone().unwrap();
|
||||
let base_logger = tracing_subscriber::fmt()
|
||||
.with_env_filter(env_filter)
|
||||
.with_target(false)
|
||||
.with_ansi(false)
|
||||
.with_writer(std::io::stdout);
|
||||
.with_writer(move || -> Box<dyn std::io::Write> {
|
||||
// we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it
|
||||
// if we do not use daemonization (e.g. in docker) it is better to log to stdout directly
|
||||
// for example to be in line with docker log command which expects logs comimg from stdout
|
||||
if daemonize {
|
||||
Box::new(x.try_clone().unwrap())
|
||||
} else {
|
||||
Box::new(std::io::stdout())
|
||||
}
|
||||
});
|
||||
|
||||
match log_format {
|
||||
LogFormat::Json => base_logger.json().init(),
|
||||
LogFormat::Plain => base_logger.init(),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(log_file)
|
||||
}
|
||||
|
||||
// #[cfg(test)]
|
||||
// Due to global logger, can't run tests in same process.
|
||||
// So until there's a non-global one, the tests are in ../tests/ as separate files.
|
||||
#[macro_export(local_inner_macros)]
|
||||
macro_rules! test_init_file_logger {
|
||||
($log_level:expr, $log_format:expr) => {{
|
||||
use std::str::FromStr;
|
||||
std::env::set_var("RUST_LOG", $log_level);
|
||||
|
||||
let tmp_dir = tempfile::TempDir::new().unwrap();
|
||||
let log_file_path = tmp_dir.path().join("logfile");
|
||||
|
||||
let log_format = $crate::logging::LogFormat::from_str($log_format).unwrap();
|
||||
let _log_file = $crate::logging::init(&log_file_path, true, log_format).unwrap();
|
||||
|
||||
let log_file = std::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&log_file_path)
|
||||
.unwrap();
|
||||
|
||||
log_file
|
||||
}};
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::seqwait::MonotonicCounter;
|
||||
pub const XLOG_BLCKSZ: u32 = 8192;
|
||||
|
||||
/// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
|
||||
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
|
||||
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct Lsn(pub u64);
|
||||
|
||||
|
||||
@@ -3,10 +3,10 @@
|
||||
//! implementation determining how to process the queries. Currently its API
|
||||
//! is rather narrow, but we can extend it once required.
|
||||
|
||||
use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
|
||||
use crate::sock_split::{BidiStream, ReadStream, WriteStream};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
|
||||
@@ -4,9 +4,9 @@
|
||||
//! is rather narrow, but we can extend it once required.
|
||||
|
||||
use crate::postgres_backend::AuthType;
|
||||
use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
|
||||
use rand::Rng;
|
||||
use std::future::Future;
|
||||
use std::net::SocketAddr;
|
||||
|
||||
@@ -2,9 +2,7 @@
|
||||
//! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
|
||||
//! on message formats.
|
||||
|
||||
// Tools for calling certain async methods in sync contexts.
|
||||
pub mod sync;
|
||||
|
||||
use crate::sync::{AsyncishRead, SyncFuture};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use postgres_protocol::PG_EPOCH;
|
||||
@@ -18,7 +16,6 @@ use std::{
|
||||
str,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
use sync::{AsyncishRead, SyncFuture};
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tracing::{trace, warn};
|
||||
|
||||
@@ -201,7 +198,7 @@ impl FeMessage {
|
||||
///
|
||||
/// ```
|
||||
/// # use std::io;
|
||||
/// # use pq_proto::FeMessage;
|
||||
/// # use utils::pq_proto::FeMessage;
|
||||
/// #
|
||||
/// # fn process_message(msg: FeMessage) -> anyhow::Result<()> {
|
||||
/// # Ok(())
|
||||
@@ -305,7 +302,6 @@ impl FeStartupPacket {
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
#[allow(clippy::manual_range_contains)]
|
||||
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
|
||||
bail!("invalid message length");
|
||||
}
|
||||
@@ -29,7 +29,7 @@ impl<S, T: Future> SyncFuture<S, T> {
|
||||
/// Example:
|
||||
///
|
||||
/// ```
|
||||
/// # use pq_proto::sync::SyncFuture;
|
||||
/// # use utils::sync::SyncFuture;
|
||||
/// # use std::future::Future;
|
||||
/// # use tokio::io::AsyncReadExt;
|
||||
/// #
|
||||
36
libs/utils/tests/logger_json_test.rs
Normal file
36
libs/utils/tests/logger_json_test.rs
Normal file
@@ -0,0 +1,36 @@
|
||||
// This could be in ../src/logging.rs but since the logger is global, these
|
||||
// can't be run in threads of the same process
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, Lines};
|
||||
use tracing::*;
|
||||
use utils::test_init_file_logger;
|
||||
|
||||
fn read_lines(file: File) -> Lines<BufReader<File>> {
|
||||
BufReader::new(file).lines()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_format_has_message_and_custom_field() {
|
||||
std::env::set_var("RUST_LOG", "info");
|
||||
|
||||
let log_file = test_init_file_logger!("info", "json");
|
||||
|
||||
let custom_field: &str = "hi";
|
||||
trace!(custom = %custom_field, "test log message");
|
||||
debug!(custom = %custom_field, "test log message");
|
||||
info!(custom = %custom_field, "test log message");
|
||||
warn!(custom = %custom_field, "test log message");
|
||||
error!(custom = %custom_field, "test log message");
|
||||
|
||||
let lines = read_lines(log_file);
|
||||
for line in lines {
|
||||
let content = line.unwrap();
|
||||
let json_object = serde_json::from_str::<serde_json::Value>(&content).unwrap();
|
||||
|
||||
assert_eq!(json_object["fields"]["custom"], "hi");
|
||||
assert_eq!(json_object["fields"]["message"], "test log message");
|
||||
|
||||
assert_ne!(json_object["level"], "TRACE");
|
||||
assert_ne!(json_object["level"], "DEBUG");
|
||||
}
|
||||
}
|
||||
36
libs/utils/tests/logger_plain_test.rs
Normal file
36
libs/utils/tests/logger_plain_test.rs
Normal file
@@ -0,0 +1,36 @@
|
||||
// This could be in ../src/logging.rs but since the logger is global, these
|
||||
// can't be run in threads of the same process
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, Lines};
|
||||
use tracing::*;
|
||||
use utils::test_init_file_logger;
|
||||
|
||||
fn read_lines(file: File) -> Lines<BufReader<File>> {
|
||||
BufReader::new(file).lines()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_plain_format_has_message_and_custom_field() {
|
||||
std::env::set_var("RUST_LOG", "warn");
|
||||
|
||||
let log_file = test_init_file_logger!("warn", "plain");
|
||||
|
||||
let custom_field: &str = "hi";
|
||||
trace!(custom = %custom_field, "test log message");
|
||||
debug!(custom = %custom_field, "test log message");
|
||||
info!(custom = %custom_field, "test log message");
|
||||
warn!(custom = %custom_field, "test log message");
|
||||
error!(custom = %custom_field, "test log message");
|
||||
|
||||
let lines = read_lines(log_file);
|
||||
for line in lines {
|
||||
let content = line.unwrap();
|
||||
serde_json::from_str::<serde_json::Value>(&content).unwrap_err();
|
||||
assert!(content.contains("custom=hi"));
|
||||
assert!(content.contains("test log message"));
|
||||
|
||||
assert!(!content.contains("TRACE"));
|
||||
assert!(!content.contains("DEBUG"));
|
||||
assert!(!content.contains("INFO"));
|
||||
}
|
||||
}
|
||||
@@ -12,61 +12,62 @@ testing = ["fail/failpoints"]
|
||||
profiling = ["pprof"]
|
||||
|
||||
[dependencies]
|
||||
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
byteorder = "1.4.3"
|
||||
bytes = "1.0.1"
|
||||
chrono = "0.4.19"
|
||||
clap = { version = "4.0", features = ["string"] }
|
||||
close_fds = "0.3.2"
|
||||
const_format = "0.2.21"
|
||||
crc32c = "0.6.0"
|
||||
crossbeam-utils = "0.8.5"
|
||||
fail = "0.5.0"
|
||||
futures = "0.3.13"
|
||||
git-version = "0.3.5"
|
||||
hex = "0.4.3"
|
||||
humantime = "2.1.0"
|
||||
humantime-serde = "1.1.1"
|
||||
hyper = "0.14"
|
||||
itertools = "0.10.3"
|
||||
nix = "0.25"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13.0"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
rstar = "0.9.3"
|
||||
scopeguard = "1.1.0"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
futures = "0.3.13"
|
||||
hex = "0.4.3"
|
||||
hyper = "0.14"
|
||||
itertools = "0.10.3"
|
||||
clap = { version = "4.0", features = ["string"] }
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
crc32c = "0.6.0"
|
||||
thiserror = "1.0"
|
||||
tar = "0.4.33"
|
||||
humantime = "2.1.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "2.0"
|
||||
signal-hook = "0.3.10"
|
||||
svg_fmt = "0.4.1"
|
||||
tar = "0.4.33"
|
||||
thiserror = "1.0"
|
||||
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
|
||||
toml_edit = { version = "0.14", features = ["easy"] }
|
||||
tracing = "0.1.36"
|
||||
url = "2"
|
||||
walkdir = "2.3.2"
|
||||
humantime-serde = "1.1.1"
|
||||
|
||||
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
|
||||
|
||||
toml_edit = { version = "0.14", features = ["easy"] }
|
||||
scopeguard = "1.1.0"
|
||||
const_format = "0.2.21"
|
||||
tracing = "0.1.36"
|
||||
signal-hook = "0.3.10"
|
||||
url = "2"
|
||||
nix = "0.25"
|
||||
once_cell = "1.13.0"
|
||||
crossbeam-utils = "0.8.5"
|
||||
fail = "0.5.0"
|
||||
git-version = "0.3.5"
|
||||
rstar = "0.9.3"
|
||||
num-traits = "0.2.15"
|
||||
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
|
||||
|
||||
etcd_broker = { path = "../libs/etcd_broker" }
|
||||
metrics = { path = "../libs/metrics" }
|
||||
pageserver_api = { path = "../libs/pageserver_api" }
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
pq_proto = { path = "../libs/pq_proto" }
|
||||
remote_storage = { path = "../libs/remote_storage" }
|
||||
tenant_size_model = { path = "../libs/tenant_size_model" }
|
||||
etcd_broker = { path = "../libs/etcd_broker" }
|
||||
metrics = { path = "../libs/metrics" }
|
||||
utils = { path = "../libs/utils" }
|
||||
remote_storage = { path = "../libs/remote_storage" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
close_fds = "0.3.2"
|
||||
walkdir = "2.3.2"
|
||||
svg_fmt = "0.4.1"
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = "0.4"
|
||||
|
||||
@@ -1,14 +1,17 @@
|
||||
//! Main entry point for the Page Server executable.
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use fail::FailScenario;
|
||||
use nix::unistd::Pid;
|
||||
use tracing::*;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use fail::FailScenario;
|
||||
use metrics::set_build_info_metric;
|
||||
|
||||
use pageserver::{
|
||||
config::{defaults::*, PageServerConf},
|
||||
http, page_cache, page_service, profiling, task_mgr,
|
||||
@@ -16,22 +19,20 @@ use pageserver::{
|
||||
task_mgr::{
|
||||
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
|
||||
},
|
||||
tenant_mgr, virtual_file,
|
||||
tenant_mgr, virtual_file, LOG_FILE_NAME,
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
lock_file, logging,
|
||||
logging,
|
||||
postgres_backend::AuthType,
|
||||
project_git_version,
|
||||
shutdown::exit_now,
|
||||
signals::{self, Signal},
|
||||
tcp_listener,
|
||||
};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
const PID_FILE_NAME: &str = "pageserver.pid";
|
||||
|
||||
const FEATURES: &[&str] = &[
|
||||
#[cfg(feature = "testing")]
|
||||
"testing",
|
||||
@@ -64,7 +65,6 @@ fn main() -> anyhow::Result<()> {
|
||||
let workdir = workdir
|
||||
.canonicalize()
|
||||
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
|
||||
|
||||
let cfg_file_path = workdir.join("pageserver.toml");
|
||||
|
||||
// Set CWD to workdir for non-daemon modes
|
||||
@@ -75,6 +75,8 @@ fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
})?;
|
||||
|
||||
let daemonize = arg_matches.get_flag("daemonize");
|
||||
|
||||
let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
|
||||
ControlFlow::Continue(conf) => conf,
|
||||
ControlFlow::Break(()) => {
|
||||
@@ -100,7 +102,7 @@ fn main() -> anyhow::Result<()> {
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
|
||||
start_pageserver(conf).context("Failed to start pageserver")?;
|
||||
start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
|
||||
|
||||
scenario.teardown();
|
||||
Ok(())
|
||||
@@ -195,33 +197,11 @@ fn initialize_config(
|
||||
})
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
logging::init(conf.log_format)?;
|
||||
info!("version: {}", version());
|
||||
fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
|
||||
// Initialize logger
|
||||
let log_file = logging::init(LOG_FILE_NAME, daemonize, conf.log_format)?;
|
||||
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
|
||||
lock_file::LockCreationResult::Created {
|
||||
new_lock_contents,
|
||||
file,
|
||||
} => {
|
||||
info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
|
||||
file
|
||||
}
|
||||
lock_file::LockCreationResult::AlreadyLocked {
|
||||
existing_lock_contents,
|
||||
} => anyhow::bail!(
|
||||
"Could not lock pid file; pageserver is already running in {:?} with PID {}",
|
||||
conf.workdir,
|
||||
existing_lock_contents
|
||||
),
|
||||
lock_file::LockCreationResult::CreationFailed(e) => {
|
||||
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
|
||||
}
|
||||
};
|
||||
// ensure that the lock file is held even if the main thread of the process is panics
|
||||
// we need to release the lock file only when the current process is gone
|
||||
let _ = Box::leak(Box::new(lock_file));
|
||||
info!("version: {}", version());
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
@@ -238,6 +218,33 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
);
|
||||
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
|
||||
|
||||
// NB: Don't spawn any threads before daemonizing!
|
||||
if daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
// There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
let stdout = log_file
|
||||
.try_clone()
|
||||
.with_context(|| format!("Failed to clone log file '{:?}'", log_file))?;
|
||||
let stderr = log_file;
|
||||
|
||||
let daemonize = Daemonize::new()
|
||||
.pid_file("pageserver.pid")
|
||||
.working_directory(".")
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
|
||||
// XXX: The parent process should exit abruptly right after
|
||||
// it has spawned a child to prevent coverage machinery from
|
||||
// dumping stats into a `profraw` file now owned by the child.
|
||||
// Otherwise, the coverage data will be damaged.
|
||||
match daemonize.exit_action(|| exit_now(0)).start() {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(err) => bail!("{err}. could not daemonize. bailing."),
|
||||
}
|
||||
}
|
||||
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
|
||||
// start profiler (if enabled)
|
||||
@@ -340,6 +347,14 @@ fn cli() -> Command {
|
||||
Command::new("Neon page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.version(version())
|
||||
.arg(
|
||||
|
||||
Arg::new("daemonize")
|
||||
.short('d')
|
||||
.long("daemonize")
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Run in the background"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("init")
|
||||
.long("init")
|
||||
|
||||
@@ -8,9 +8,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use std::env;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::ConnectionId;
|
||||
|
||||
use std::num::NonZeroUsize;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
@@ -50,9 +48,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_LOG_FORMAT: &str = "plain";
|
||||
|
||||
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
|
||||
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -72,9 +67,6 @@ pub mod defaults {
|
||||
#initial_superuser_name = '{DEFAULT_SUPERUSER}'
|
||||
|
||||
#log_format = '{DEFAULT_LOG_FORMAT}'
|
||||
|
||||
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
|
||||
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -140,9 +132,6 @@ pub struct PageServerConf {
|
||||
pub broker_endpoints: Vec<Url>,
|
||||
|
||||
pub log_format: LogFormat,
|
||||
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
|
||||
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
@@ -211,8 +200,6 @@ struct PageServerConfigBuilder {
|
||||
broker_endpoints: BuilderValue<Vec<Url>>,
|
||||
|
||||
log_format: BuilderValue<LogFormat>,
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -241,8 +228,6 @@ impl Default for PageServerConfigBuilder {
|
||||
broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
|
||||
broker_endpoints: Set(Vec::new()),
|
||||
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -319,10 +304,6 @@ impl PageServerConfigBuilder {
|
||||
self.log_format = BuilderValue::Set(log_format)
|
||||
}
|
||||
|
||||
pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) {
|
||||
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let broker_endpoints = self
|
||||
.broker_endpoints
|
||||
@@ -368,11 +349,6 @@ impl PageServerConfigBuilder {
|
||||
.broker_etcd_prefix
|
||||
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
|
||||
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
|
||||
concurrent_tenant_size_logical_size_queries: self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or(anyhow!(
|
||||
"missing concurrent_tenant_size_logical_size_queries"
|
||||
))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -415,22 +391,6 @@ impl PageServerConf {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn traces_path(&self) -> PathBuf {
|
||||
self.workdir.join("traces")
|
||||
}
|
||||
|
||||
pub fn trace_path(
|
||||
&self,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
connection_id: &ConnectionId,
|
||||
) -> PathBuf {
|
||||
self.traces_path()
|
||||
.join(tenant_id.to_string())
|
||||
.join(timeline_id.to_string())
|
||||
.join(connection_id.to_string())
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain timeline's metadata file should be located.
|
||||
pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
|
||||
@@ -516,12 +476,6 @@ impl PageServerConf {
|
||||
"log_format" => builder.log_format(
|
||||
LogFormat::from_config(&parse_toml_string(key, item)?)?
|
||||
),
|
||||
"concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
|
||||
let input = parse_toml_string(key, item)?;
|
||||
let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
|
||||
let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
|
||||
ConfigurableSemaphore::new(permits)
|
||||
}),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -635,7 +589,6 @@ impl PageServerConf {
|
||||
broker_endpoints: Vec::new(),
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -701,58 +654,6 @@ fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Configurable semaphore permits setting.
|
||||
///
|
||||
/// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty
|
||||
/// semaphore cannot be distinguished, leading any feature using these to await forever (or until
|
||||
/// new permits are added).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ConfigurableSemaphore {
|
||||
initial_permits: NonZeroUsize,
|
||||
inner: std::sync::Arc<tokio::sync::Semaphore>,
|
||||
}
|
||||
|
||||
impl ConfigurableSemaphore {
|
||||
pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) {
|
||||
Some(x) => x,
|
||||
None => panic!("const unwrap is not yet stable"),
|
||||
};
|
||||
|
||||
/// Initializse using a non-zero amount of permits.
|
||||
///
|
||||
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
|
||||
/// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
|
||||
/// behave like [`futures::future::pending`], just waiting until new permits are added.
|
||||
pub fn new(initial_permits: NonZeroUsize) -> Self {
|
||||
ConfigurableSemaphore {
|
||||
initial_permits,
|
||||
inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ConfigurableSemaphore {
|
||||
fn default() -> Self {
|
||||
Self::new(Self::DEFAULT_INITIAL)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for ConfigurableSemaphore {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
// the number of permits can be increased at runtime, so we cannot really fulfill the
|
||||
// PartialEq value equality otherwise
|
||||
self.initial_permits == other.initial_permits
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for ConfigurableSemaphore {}
|
||||
|
||||
impl ConfigurableSemaphore {
|
||||
pub fn inner(&self) -> &std::sync::Arc<tokio::sync::Semaphore> {
|
||||
&self.inner
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{
|
||||
@@ -824,7 +725,6 @@ log_format = 'json'
|
||||
.expect("Failed to parse a valid broker endpoint URL")],
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -870,7 +770,6 @@ log_format = 'json'
|
||||
.expect("Failed to parse a valid broker endpoint URL")],
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
log_format: LogFormat::Json,
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -354,54 +354,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/size:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
get:
|
||||
description: |
|
||||
Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
|
||||
responses:
|
||||
"200":
|
||||
description: OK,
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
|
||||
required:
|
||||
- id
|
||||
- size
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
format: hex
|
||||
size:
|
||||
type: integer
|
||||
description: |
|
||||
Size metric in bytes.
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
|
||||
@@ -227,10 +227,13 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timelines = info_span!("timeline_list", tenant = %tenant_id).in_scope(|| {
|
||||
let timelines = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
Ok(tenant.list_timelines())
|
||||
})?;
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
for timeline in timelines {
|
||||
@@ -520,7 +523,9 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
// if tenant is in progress of downloading it can be absent in global tenant map
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false);
|
||||
let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false))
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let remote_index = &state.remote_index;
|
||||
@@ -566,44 +571,6 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
)
|
||||
}
|
||||
|
||||
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// this can be long operation, it currently is not backed by any request coalescing or similar
|
||||
let inputs = tenant
|
||||
.gather_size_inputs()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let size = inputs.calculate().map_err(ApiError::InternalServerError)?;
|
||||
|
||||
/// Private response type with the additional "unstable" `inputs` field.
|
||||
///
|
||||
/// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is
|
||||
/// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Serialize)]
|
||||
struct TenantHistorySize {
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
id: TenantId,
|
||||
/// Size is a mixture of WAL and logical size, so the unit is bytes.
|
||||
size: u64,
|
||||
inputs: crate::tenant::size::ModelInputs,
|
||||
}
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
TenantHistorySize {
|
||||
id: tenant_id,
|
||||
size,
|
||||
inputs,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
// Helper function to standardize the error messages we produce on bad durations
|
||||
//
|
||||
// Intended to be used with anyhow's `with_context`, e.g.:
|
||||
@@ -618,7 +585,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
println!("tenant create: {:?}", request_data.trace_read_requests);
|
||||
let remote_index = get_state(&request).remote_index.clone();
|
||||
|
||||
let mut tenant_conf = TenantConfOpt::default();
|
||||
@@ -660,9 +626,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
|
||||
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
|
||||
}
|
||||
if let Some(trace_read_requests) = request_data.trace_read_requests {
|
||||
tenant_conf.trace_read_requests = Some(trace_read_requests);
|
||||
}
|
||||
|
||||
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
|
||||
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
|
||||
@@ -750,9 +713,6 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
|
||||
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
|
||||
}
|
||||
if let Some(trace_read_requests) = request_data.trace_read_requests {
|
||||
tenant_conf.trace_read_requests = Some(trace_read_requests);
|
||||
}
|
||||
|
||||
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
|
||||
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
|
||||
@@ -832,14 +792,14 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
|
||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||
|
||||
let _span_guard =
|
||||
info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered();
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
|
||||
// Use tenant's pitr setting
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
let result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
|
||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
@@ -875,7 +835,6 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
.map_err(ApiError::NotFound)?;
|
||||
timeline
|
||||
.checkpoint(CheckpointConfig::Forced)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -939,7 +898,6 @@ pub fn make_router(
|
||||
.get("/v1/tenant", tenant_list_handler)
|
||||
.post("/v1/tenant", tenant_create_handler)
|
||||
.get("/v1/tenant/:tenant_id", tenant_status)
|
||||
.get("/v1/tenant/:tenant_id/size", tenant_size_handler)
|
||||
.put("/v1/tenant/config", tenant_config_handler)
|
||||
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
|
||||
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
|
||||
|
||||
@@ -15,7 +15,6 @@ pub mod tenant;
|
||||
pub mod tenant_config;
|
||||
pub mod tenant_mgr;
|
||||
pub mod tenant_tasks;
|
||||
pub mod trace;
|
||||
pub mod virtual_file;
|
||||
pub mod walingest;
|
||||
pub mod walreceiver;
|
||||
@@ -44,6 +43,8 @@ pub const DEFAULT_PG_VERSION: u32 = 14;
|
||||
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
||||
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
||||
|
||||
pub const LOG_FILE_NAME: &str = "pageserver.log";
|
||||
|
||||
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
/// Config for the Repository checkpointer
|
||||
@@ -80,6 +81,7 @@ pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
|
||||
// There should be nothing left, but let's be sure
|
||||
task_mgr::shutdown_tasks(None, None, None).await;
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
@@ -31,7 +31,6 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[
|
||||
"compact",
|
||||
"create images",
|
||||
"init logical size",
|
||||
"logical size",
|
||||
"load layer map",
|
||||
"gc",
|
||||
];
|
||||
@@ -366,7 +365,6 @@ pub struct TimelineMetrics {
|
||||
pub compact_time_histo: Histogram,
|
||||
pub create_images_time_histo: Histogram,
|
||||
pub init_logical_size_histo: Histogram,
|
||||
pub logical_size_histo: Histogram,
|
||||
pub load_layer_map_histo: Histogram,
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub wait_lsn_time_histo: Histogram,
|
||||
@@ -399,9 +397,6 @@ impl TimelineMetrics {
|
||||
let init_logical_size_histo = STORAGE_TIME
|
||||
.get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let logical_size_histo = STORAGE_TIME
|
||||
.get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let load_layer_map_histo = STORAGE_TIME
|
||||
.get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -433,7 +428,6 @@ impl TimelineMetrics {
|
||||
compact_time_histo,
|
||||
create_images_time_histo,
|
||||
init_logical_size_histo,
|
||||
logical_size_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
wait_lsn_time_histo,
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
//
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Buf;
|
||||
use bytes::Bytes;
|
||||
use futures::{Stream, StreamExt};
|
||||
use pageserver_api::models::{
|
||||
@@ -19,23 +18,21 @@ use pageserver_api::models::{
|
||||
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
|
||||
PagestreamNblocksRequest, PagestreamNblocksResponse,
|
||||
};
|
||||
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
|
||||
use std::io;
|
||||
use std::net::TcpListener;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use tokio::pin;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::io::SyncIoBridge;
|
||||
use tracing::*;
|
||||
use utils::id::ConnectionId;
|
||||
use utils::{
|
||||
auth::{self, Claims, JwtAuth, Scope},
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
postgres_backend_async::{self, PostgresBackend},
|
||||
pq_proto::{BeMessage, FeMessage, RowDescriptor},
|
||||
simple_rcu::RcuReadGuard,
|
||||
};
|
||||
|
||||
@@ -48,7 +45,6 @@ use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant_mgr;
|
||||
use crate::trace::Tracer;
|
||||
use crate::CheckpointConfig;
|
||||
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
@@ -271,18 +267,6 @@ impl PageServerHandler {
|
||||
// so there is no need to reset the association
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
|
||||
// Make request tracer if needed
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
|
||||
let mut tracer = if tenant.get_trace_read_requests() {
|
||||
let connection_id = ConnectionId::generate();
|
||||
let path = tenant
|
||||
.conf
|
||||
.trace_path(&tenant_id, &timeline_id, &connection_id);
|
||||
Some(Tracer::new(path))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Check that the timeline exists
|
||||
let timeline = get_local_timeline(tenant_id, timeline_id)?;
|
||||
|
||||
@@ -315,12 +299,7 @@ impl PageServerHandler {
|
||||
|
||||
trace!("query: {copy_data_bytes:?}");
|
||||
|
||||
// Trace request if needed
|
||||
if let Some(t) = tracer.as_mut() {
|
||||
t.trace(©_data_bytes)
|
||||
}
|
||||
|
||||
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
|
||||
let neon_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
|
||||
|
||||
let response = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
@@ -387,12 +366,14 @@ impl PageServerHandler {
|
||||
pgb.write_message(&BeMessage::CopyInResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
let copyin_stream = copyin_stream(pgb);
|
||||
pin!(copyin_stream);
|
||||
|
||||
timeline
|
||||
.import_basebackup_from_tar(&mut copyin_stream, base_lsn)
|
||||
.await?;
|
||||
// import_basebackup_from_tar() is not async, mainly because the Tar crate
|
||||
// it uses is not async. So we need to jump through some hoops:
|
||||
// - convert the input from client connection to a synchronous Read
|
||||
// - use block_in_place()
|
||||
let mut copyin_stream = Box::pin(copyin_stream(pgb));
|
||||
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
|
||||
tokio::task::block_in_place(|| timeline.import_basebackup_from_tar(reader, base_lsn))?;
|
||||
timeline.initialize()?;
|
||||
|
||||
// Drain the rest of the Copy data
|
||||
let mut bytes_after_tar = 0;
|
||||
@@ -457,7 +438,7 @@ impl PageServerHandler {
|
||||
// We only want to persist the data, and it doesn't matter if it's in the
|
||||
// shape of deltas or images.
|
||||
info!("flushing layers");
|
||||
timeline.checkpoint(CheckpointConfig::Flush).await?;
|
||||
timeline.checkpoint(CheckpointConfig::Flush)?;
|
||||
|
||||
info!("done");
|
||||
Ok(())
|
||||
|
||||
@@ -12,12 +12,8 @@
|
||||
//!
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use bytes::Bytes;
|
||||
use futures::Stream;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use tokio::sync::watch;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::io::SyncIoBridge;
|
||||
use tracing::*;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
|
||||
@@ -33,7 +29,6 @@ use std::io::Write;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
use std::pin::Pin;
|
||||
use std::process::Command;
|
||||
use std::process::Stdio;
|
||||
use std::sync::Arc;
|
||||
@@ -77,8 +72,6 @@ pub mod storage_layer;
|
||||
|
||||
mod timeline;
|
||||
|
||||
pub mod size;
|
||||
|
||||
use storage_layer::Layer;
|
||||
|
||||
pub use timeline::Timeline;
|
||||
@@ -127,9 +120,6 @@ pub struct Tenant {
|
||||
|
||||
/// Makes every timeline to backup their files to remote storage.
|
||||
upload_layers: bool,
|
||||
|
||||
/// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
|
||||
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
|
||||
}
|
||||
|
||||
/// A timeline with some of its files on disk, being initialized.
|
||||
@@ -142,7 +132,7 @@ pub struct Tenant {
|
||||
pub struct UninitializedTimeline<'t> {
|
||||
owning_tenant: &'t Tenant,
|
||||
timeline_id: TimelineId,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
|
||||
raw_timeline: Option<(Timeline, TimelineUninitMark)>,
|
||||
}
|
||||
|
||||
/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
|
||||
@@ -174,6 +164,7 @@ impl UninitializedTimeline<'_> {
|
||||
let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
|
||||
format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
let new_timeline = Arc::new(new_timeline);
|
||||
|
||||
let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
|
||||
// TODO it would be good to ensure that, but apparently a lot of our testing is dependend on that at least
|
||||
@@ -201,9 +192,6 @@ impl UninitializedTimeline<'_> {
|
||||
})?;
|
||||
new_timeline.set_state(TimelineState::Active);
|
||||
v.insert(Arc::clone(&new_timeline));
|
||||
|
||||
new_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
new_timeline.launch_wal_receiver();
|
||||
}
|
||||
}
|
||||
@@ -212,28 +200,20 @@ impl UninitializedTimeline<'_> {
|
||||
}
|
||||
|
||||
/// Prepares timeline data by loading it from the basebackup archive.
|
||||
pub async fn import_basebackup_from_tar(
|
||||
self,
|
||||
mut copyin_stream: &mut Pin<&mut impl Stream<Item = io::Result<Bytes>>>,
|
||||
pub fn import_basebackup_from_tar(
|
||||
&self,
|
||||
reader: impl std::io::Read,
|
||||
base_lsn: Lsn,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
) -> anyhow::Result<()> {
|
||||
let raw_timeline = self.raw_timeline()?;
|
||||
|
||||
// import_basebackup_from_tar() is not async, mainly because the Tar crate
|
||||
// it uses is not async. So we need to jump through some hoops:
|
||||
// - convert the input from client connection to a synchronous Read
|
||||
// - use block_in_place()
|
||||
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
|
||||
|
||||
tokio::task::block_in_place(|| {
|
||||
import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn)
|
||||
.context("Failed to import basebackup")
|
||||
})?;
|
||||
|
||||
// Flush loop needs to be spawned in order for checkpoint to be able to flush.
|
||||
// We want to run proper checkpoint before we mark timeline as available to outside world
|
||||
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
|
||||
raw_timeline.maybe_spawn_flush_loop();
|
||||
import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn).with_context(
|
||||
|| {
|
||||
format!(
|
||||
"Failed to import basebackup for timeline {}/{}",
|
||||
self.owning_tenant.tenant_id, self.timeline_id
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
bail!("failpoint before-checkpoint-new-timeline");
|
||||
@@ -241,15 +221,16 @@ impl UninitializedTimeline<'_> {
|
||||
|
||||
raw_timeline
|
||||
.checkpoint(CheckpointConfig::Flush)
|
||||
.await
|
||||
.context("Failed to checkpoint after basebackup import")?;
|
||||
|
||||
let timeline = self.initialize()?;
|
||||
|
||||
Ok(timeline)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to checkpoint after basebackup import for timeline {}/{}",
|
||||
self.owning_tenant.tenant_id, self.timeline_id
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
|
||||
fn raw_timeline(&self) -> anyhow::Result<&Timeline> {
|
||||
Ok(&self
|
||||
.raw_timeline
|
||||
.as_ref()
|
||||
@@ -484,7 +465,7 @@ impl Tenant {
|
||||
|
||||
self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?
|
||||
}
|
||||
None => self.bootstrap_timeline(new_timeline_id, pg_version).await?,
|
||||
None => self.bootstrap_timeline(new_timeline_id, pg_version)?,
|
||||
};
|
||||
|
||||
// Have added new timeline into the tenant, now its background tasks are needed.
|
||||
@@ -502,7 +483,7 @@ impl Tenant {
|
||||
/// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
|
||||
/// to make tests more deterministic.
|
||||
/// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
|
||||
pub async fn gc_iteration(
|
||||
pub fn gc_iteration(
|
||||
&self,
|
||||
target_timeline_id: Option<TimelineId>,
|
||||
horizon: u64,
|
||||
@@ -518,13 +499,11 @@ impl Tenant {
|
||||
.map(|x| x.to_string())
|
||||
.unwrap_or_else(|| "-".to_string());
|
||||
|
||||
{
|
||||
let _timer = STORAGE_TIME
|
||||
.with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
|
||||
.start_timer();
|
||||
self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
|
||||
.await
|
||||
}
|
||||
STORAGE_TIME
|
||||
.with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
|
||||
.observe_closure_duration(|| {
|
||||
self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
|
||||
})
|
||||
}
|
||||
|
||||
/// Perform one compaction iteration.
|
||||
@@ -544,6 +523,7 @@ impl Tenant {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timelines_to_compact = timelines
|
||||
.iter()
|
||||
.filter(|(_, timeline)| timeline.is_active())
|
||||
.map(|(timeline_id, timeline)| (*timeline_id, timeline.clone()))
|
||||
.collect::<Vec<_>>();
|
||||
drop(timelines);
|
||||
@@ -560,24 +540,23 @@ impl Tenant {
|
||||
///
|
||||
/// Used at graceful shutdown.
|
||||
///
|
||||
pub async fn checkpoint(&self) -> anyhow::Result<()> {
|
||||
pub fn checkpoint(&self) -> anyhow::Result<()> {
|
||||
// Scan through the hashmap and collect a list of all the timelines,
|
||||
// while holding the lock. Then drop the lock and actually perform the
|
||||
// checkpoints. We don't want to block everything else while the
|
||||
// checkpoint runs.
|
||||
let timelines_to_checkpoint = {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
timelines
|
||||
.iter()
|
||||
.map(|(id, timeline)| (*id, Arc::clone(timeline)))
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timelines_to_checkpoint = timelines
|
||||
.iter()
|
||||
.map(|(timeline_id, timeline)| (*timeline_id, Arc::clone(timeline)))
|
||||
.collect::<Vec<_>>();
|
||||
drop(timelines);
|
||||
|
||||
for (id, timeline) in &timelines_to_checkpoint {
|
||||
timeline
|
||||
.checkpoint(CheckpointConfig::Flush)
|
||||
.instrument(info_span!("checkpoint", timeline = %id, tenant = %self.tenant_id))
|
||||
.await?;
|
||||
for (timeline_id, timeline) in &timelines_to_checkpoint {
|
||||
let _entered =
|
||||
info_span!("checkpoint", timeline = %timeline_id, tenant = %self.tenant_id)
|
||||
.entered();
|
||||
timeline.checkpoint(CheckpointConfig::Flush)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -806,13 +785,6 @@ impl Tenant {
|
||||
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
|
||||
}
|
||||
|
||||
pub fn get_trace_read_requests(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap();
|
||||
tenant_conf
|
||||
.trace_read_requests
|
||||
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
|
||||
}
|
||||
|
||||
pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
|
||||
self.tenant_conf.write().unwrap().update(&new_tenant_conf);
|
||||
}
|
||||
@@ -863,7 +835,6 @@ impl Tenant {
|
||||
remote_index,
|
||||
upload_layers,
|
||||
state,
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -985,9 +956,8 @@ impl Tenant {
|
||||
// +-----baz-------->
|
||||
//
|
||||
//
|
||||
// 1. Grab 'gc_cs' mutex to prevent new timelines from being created while Timeline's
|
||||
// `gc_infos` are being refreshed
|
||||
// 2. Scan collected timelines, and on each timeline, make note of the
|
||||
// 1. Grab 'gc_cs' mutex to prevent new timelines from being created
|
||||
// 2. Scan all timelines, and on each timeline, make note of the
|
||||
// all the points where other timelines have been branched off.
|
||||
// We will refrain from removing page versions at those LSNs.
|
||||
// 3. For each timeline, scan all layer files on the timeline.
|
||||
@@ -998,7 +968,7 @@ impl Tenant {
|
||||
// - if a relation has a non-incremental persistent layer on a child branch, then we
|
||||
// don't need to keep that in the parent anymore. But currently
|
||||
// we do.
|
||||
async fn gc_iteration_internal(
|
||||
fn gc_iteration_internal(
|
||||
&self,
|
||||
target_timeline_id: Option<TimelineId>,
|
||||
horizon: u64,
|
||||
@@ -1008,68 +978,6 @@ impl Tenant {
|
||||
let mut totals: GcResult = Default::default();
|
||||
let now = Instant::now();
|
||||
|
||||
let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;
|
||||
|
||||
// Perform GC for each timeline.
|
||||
//
|
||||
// Note that we don't hold the GC lock here because we don't want
|
||||
// to delay the branch creation task, which requires the GC lock.
|
||||
// A timeline GC iteration can be slow because it may need to wait for
|
||||
// compaction (both require `layer_removal_cs` lock),
|
||||
// but the GC iteration can run concurrently with branch creation.
|
||||
//
|
||||
// See comments in [`Tenant::branch_timeline`] for more information
|
||||
// about why branch creation task can run concurrently with timeline's GC iteration.
|
||||
for timeline in gc_timelines {
|
||||
if task_mgr::is_shutdown_requested() {
|
||||
// We were requested to shut down. Stop and return with the progress we
|
||||
// made.
|
||||
break;
|
||||
}
|
||||
|
||||
// If requested, force flush all in-memory layers to disk first,
|
||||
// so that they too can be garbage collected. That's
|
||||
// used in tests, so we want as deterministic results as possible.
|
||||
if checkpoint_before_gc {
|
||||
timeline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
info!(
|
||||
"timeline {} checkpoint_before_gc done",
|
||||
timeline.timeline_id
|
||||
);
|
||||
}
|
||||
|
||||
let result = timeline.gc()?;
|
||||
totals += result;
|
||||
}
|
||||
|
||||
totals.elapsed = now.elapsed();
|
||||
Ok(totals)
|
||||
}
|
||||
|
||||
/// Refreshes the Timeline::gc_info for all timelines, returning the
|
||||
/// vector of timelines which have [`Timeline::get_last_record_lsn`] past
|
||||
/// [`Tenant::get_gc_horizon`].
|
||||
///
|
||||
/// This is usually executed as part of periodic gc, but can now be triggered more often.
|
||||
pub fn refresh_gc_info(&self) -> anyhow::Result<Vec<Arc<Timeline>>> {
|
||||
// since this method can now be called at different rates than the configured gc loop, it
|
||||
// might be that these configuration values get applied faster than what it was previously,
|
||||
// since these were only read from the gc task.
|
||||
let horizon = self.get_gc_horizon();
|
||||
let pitr = self.get_pitr_interval();
|
||||
|
||||
// refresh all timelines
|
||||
let target_timeline_id = None;
|
||||
|
||||
self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)
|
||||
}
|
||||
|
||||
fn refresh_gc_info_internal(
|
||||
&self,
|
||||
target_timeline_id: Option<TimelineId>,
|
||||
horizon: u64,
|
||||
pitr: Duration,
|
||||
) -> anyhow::Result<Vec<Arc<Timeline>>> {
|
||||
// grab mutex to prevent new timelines from being created here.
|
||||
let gc_cs = self.gc_cs.lock().unwrap();
|
||||
|
||||
@@ -1087,7 +995,11 @@ impl Tenant {
|
||||
|
||||
timelines
|
||||
.iter()
|
||||
.filter(|(_, timeline)| timeline.is_active())
|
||||
.map(|(timeline_id, timeline_entry)| {
|
||||
// This is unresolved question for now, how to do gc in presence of remote timelines
|
||||
// especially when this is combined with branching.
|
||||
// Somewhat related: https://github.com/neondatabase/neon/issues/999
|
||||
if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
|
||||
// If target_timeline is specified, we only need to know branchpoints of its children
|
||||
if let Some(timeline_id) = target_timeline_id {
|
||||
@@ -1141,7 +1053,41 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
drop(gc_cs);
|
||||
Ok(gc_timelines)
|
||||
|
||||
// Perform GC for each timeline.
|
||||
//
|
||||
// Note that we don't hold the GC lock here because we don't want
|
||||
// to delay the branch creation task, which requires the GC lock.
|
||||
// A timeline GC iteration can be slow because it may need to wait for
|
||||
// compaction (both require `layer_removal_cs` lock),
|
||||
// but the GC iteration can run concurrently with branch creation.
|
||||
//
|
||||
// See comments in [`Tenant::branch_timeline`] for more information
|
||||
// about why branch creation task can run concurrently with timeline's GC iteration.
|
||||
for timeline in gc_timelines {
|
||||
if task_mgr::is_shutdown_requested() {
|
||||
// We were requested to shut down. Stop and return with the progress we
|
||||
// made.
|
||||
break;
|
||||
}
|
||||
|
||||
// If requested, force flush all in-memory layers to disk first,
|
||||
// so that they too can be garbage collected. That's
|
||||
// used in tests, so we want as deterministic results as possible.
|
||||
if checkpoint_before_gc {
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
info!(
|
||||
"timeline {} checkpoint_before_gc done",
|
||||
timeline.timeline_id
|
||||
);
|
||||
}
|
||||
|
||||
let result = timeline.gc()?;
|
||||
totals += result;
|
||||
}
|
||||
|
||||
totals.elapsed = now.elapsed();
|
||||
Ok(totals)
|
||||
}
|
||||
|
||||
/// Branch an existing timeline
|
||||
@@ -1245,15 +1191,14 @@ impl Tenant {
|
||||
|
||||
/// - run initdb to init temporary instance and get bootstrap data
|
||||
/// - after initialization complete, remove the temp dir.
|
||||
async fn bootstrap_timeline(
|
||||
fn bootstrap_timeline(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let timeline_uninit_mark = {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
self.create_timeline_uninit_mark(timeline_id, &timelines)?
|
||||
};
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timeline_uninit_mark = self.create_timeline_uninit_mark(timeline_id, &timelines)?;
|
||||
drop(timelines);
|
||||
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
|
||||
// temporary directory for basebackup files for the given timeline.
|
||||
let initdb_path = path_with_suffix_extension(
|
||||
@@ -1303,35 +1248,25 @@ impl Tenant {
|
||||
|
||||
let tenant_id = raw_timeline.owning_tenant.tenant_id;
|
||||
let unfinished_timeline = raw_timeline.raw_timeline()?;
|
||||
|
||||
tokio::task::block_in_place(|| {
|
||||
import_datadir::import_timeline_from_postgres_datadir(
|
||||
unfinished_timeline,
|
||||
pgdata_path,
|
||||
pgdata_lsn,
|
||||
)
|
||||
})
|
||||
import_datadir::import_timeline_from_postgres_datadir(
|
||||
unfinished_timeline,
|
||||
pgdata_path,
|
||||
pgdata_lsn,
|
||||
)
|
||||
.with_context(|| {
|
||||
format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
|
||||
// Flush loop needs to be spawned in order for checkpoint to be able to flush.
|
||||
// We want to run proper checkpoint before we mark timeline as available to outside world
|
||||
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
|
||||
unfinished_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
anyhow::bail!("failpoint before-checkpoint-new-timeline");
|
||||
});
|
||||
|
||||
unfinished_timeline
|
||||
.checkpoint(CheckpointConfig::Forced).await
|
||||
.checkpoint(CheckpointConfig::Forced)
|
||||
.with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?;
|
||||
|
||||
let timeline = {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
raw_timeline.initialize_with_lock(&mut timelines, false)?
|
||||
};
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
let timeline = raw_timeline.initialize_with_lock(&mut timelines, false)?;
|
||||
drop(timelines);
|
||||
|
||||
info!(
|
||||
"created root timeline {} timeline.lsn {}",
|
||||
@@ -1371,7 +1306,7 @@ impl Tenant {
|
||||
Ok(UninitializedTimeline {
|
||||
owning_tenant: self,
|
||||
timeline_id: new_timeline_id,
|
||||
raw_timeline: Some((Arc::new(new_timeline), uninit_mark)),
|
||||
raw_timeline: Some((new_timeline, uninit_mark)),
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -1490,7 +1425,7 @@ impl Tenant {
|
||||
let timeline = UninitializedTimeline {
|
||||
owning_tenant: self,
|
||||
timeline_id,
|
||||
raw_timeline: Some((Arc::new(dummy_timeline), TimelineUninitMark::dummy())),
|
||||
raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())),
|
||||
};
|
||||
match timeline.initialize_with_lock(&mut timelines_accessor, true) {
|
||||
Ok(initialized_timeline) => {
|
||||
@@ -1511,25 +1446,6 @@ impl Tenant {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gathers inputs from all of the timelines to produce a sizing model input.
|
||||
///
|
||||
/// Future is cancellation safe. Only one calculation can be running at once per tenant.
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
|
||||
pub async fn gather_size_inputs(&self) -> anyhow::Result<size::ModelInputs> {
|
||||
let logical_sizes_at_once = self
|
||||
.conf
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.inner();
|
||||
|
||||
// TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries
|
||||
// are for testing/experimenting, we tolerate this.
|
||||
//
|
||||
// See more for on the issue #2748 condenced out of the initial PR review.
|
||||
let mut shared_cache = self.cached_logical_sizes.lock().await;
|
||||
|
||||
size::gather_inputs(self, logical_sizes_at_once, &mut *shared_cache).await
|
||||
}
|
||||
}
|
||||
|
||||
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
|
||||
@@ -1673,7 +1589,6 @@ pub mod harness {
|
||||
walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
|
||||
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
|
||||
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
|
||||
trace_read_requests: Some(tenant_conf.trace_read_requests),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1945,7 +1860,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
|
||||
fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
|
||||
let mut lsn = start_lsn;
|
||||
#[allow(non_snake_case)]
|
||||
{
|
||||
@@ -1966,7 +1881,7 @@ mod tests {
|
||||
writer.finish_write(lsn);
|
||||
lsn += 0x10;
|
||||
}
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
{
|
||||
let writer = tline.writer();
|
||||
writer.put(
|
||||
@@ -1983,26 +1898,24 @@ mod tests {
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
}
|
||||
tline.checkpoint(CheckpointConfig::Forced).await
|
||||
tline.checkpoint(CheckpointConfig::Forced)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
|
||||
#[test]
|
||||
fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let tenant =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
|
||||
.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
// FIXME: this doesn't actually remove any layer currently, given how the checkpointing
|
||||
// and compaction works. But it does set the 'cutoff' point so that the cross check
|
||||
// below should fail.
|
||||
tenant
|
||||
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
|
||||
.await?;
|
||||
tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
|
||||
// try to branch at lsn 25, should fail because we already garbage collected the data
|
||||
match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
|
||||
@@ -2047,14 +1960,14 @@ mod tests {
|
||||
/*
|
||||
// FIXME: This currently fails to error out. Calling GC doesn't currently
|
||||
// remove the old value, we'd need to work a little harder
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
|
||||
#[test]
|
||||
fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
|
||||
.load();
|
||||
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
|
||||
@@ -2067,47 +1980,43 @@ mod tests {
|
||||
}
|
||||
*/
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
|
||||
#[test]
|
||||
fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
|
||||
let tenant =
|
||||
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
let newtline = tenant
|
||||
.get_timeline(NEW_TIMELINE_ID, true)
|
||||
.expect("Should have a local timeline");
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
tenant
|
||||
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
|
||||
.await?;
|
||||
tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
#[tokio::test]
|
||||
async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
|
||||
#[test]
|
||||
fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
|
||||
let tenant =
|
||||
TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
let newtline = tenant
|
||||
.get_timeline(NEW_TIMELINE_ID, true)
|
||||
.expect("Should have a local timeline");
|
||||
|
||||
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
|
||||
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
|
||||
|
||||
// run gc on parent
|
||||
tenant
|
||||
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
|
||||
.await?;
|
||||
tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
|
||||
// Check that the data is still accessible on the branch.
|
||||
assert_eq!(
|
||||
@@ -2118,8 +2027,8 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn timeline_load() -> anyhow::Result<()> {
|
||||
#[test]
|
||||
fn timeline_load() -> anyhow::Result<()> {
|
||||
const TEST_NAME: &str = "timeline_load";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
{
|
||||
@@ -2127,8 +2036,8 @@ mod tests {
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x8000))?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
}
|
||||
|
||||
let tenant = harness.load();
|
||||
@@ -2139,8 +2048,8 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
|
||||
#[test]
|
||||
fn timeline_load_with_ancestor() -> anyhow::Result<()> {
|
||||
const TEST_NAME: &str = "timeline_load_with_ancestor";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
// create two timelines
|
||||
@@ -2150,8 +2059,8 @@ mod tests {
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
|
||||
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
|
||||
@@ -2159,8 +2068,8 @@ mod tests {
|
||||
.get_timeline(NEW_TIMELINE_ID, true)
|
||||
.expect("Should have a local timeline");
|
||||
|
||||
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
}
|
||||
|
||||
// check that both of them are initially unloaded
|
||||
@@ -2220,8 +2129,8 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_images() -> anyhow::Result<()> {
|
||||
#[test]
|
||||
fn test_images() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_images")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -2232,7 +2141,7 @@ mod tests {
|
||||
writer.finish_write(Lsn(0x10));
|
||||
drop(writer);
|
||||
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.compact()?;
|
||||
|
||||
let writer = tline.writer();
|
||||
@@ -2240,7 +2149,7 @@ mod tests {
|
||||
writer.finish_write(Lsn(0x20));
|
||||
drop(writer);
|
||||
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.compact()?;
|
||||
|
||||
let writer = tline.writer();
|
||||
@@ -2248,7 +2157,7 @@ mod tests {
|
||||
writer.finish_write(Lsn(0x30));
|
||||
drop(writer);
|
||||
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.compact()?;
|
||||
|
||||
let writer = tline.writer();
|
||||
@@ -2256,7 +2165,7 @@ mod tests {
|
||||
writer.finish_write(Lsn(0x40));
|
||||
drop(writer);
|
||||
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.compact()?;
|
||||
|
||||
assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
|
||||
@@ -2272,8 +2181,8 @@ mod tests {
|
||||
// Insert 1000 key-value pairs with increasing keys, checkpoint,
|
||||
// repeat 50 times.
|
||||
//
|
||||
#[tokio::test]
|
||||
async fn test_bulk_insert() -> anyhow::Result<()> {
|
||||
#[test]
|
||||
fn test_bulk_insert() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_bulk_insert")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -2306,7 +2215,7 @@ mod tests {
|
||||
let cutoff = tline.get_last_record_lsn();
|
||||
|
||||
tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.compact()?;
|
||||
tline.gc()?;
|
||||
}
|
||||
@@ -2314,8 +2223,8 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_random_updates() -> anyhow::Result<()> {
|
||||
#[test]
|
||||
fn test_random_updates() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_random_updates")?.load();
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -2378,7 +2287,7 @@ mod tests {
|
||||
println!("checkpointing {}", lsn);
|
||||
let cutoff = tline.get_last_record_lsn();
|
||||
tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.compact()?;
|
||||
tline.gc()?;
|
||||
}
|
||||
@@ -2386,8 +2295,8 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_traverse_branches() -> anyhow::Result<()> {
|
||||
#[test]
|
||||
fn test_traverse_branches() -> anyhow::Result<()> {
|
||||
let tenant = TenantHarness::create("test_traverse_branches")?.load();
|
||||
let mut tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
@@ -2459,7 +2368,7 @@ mod tests {
|
||||
println!("checkpointing {}", lsn);
|
||||
let cutoff = tline.get_last_record_lsn();
|
||||
tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
|
||||
tline.checkpoint(CheckpointConfig::Forced).await?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.compact()?;
|
||||
tline.gc()?;
|
||||
}
|
||||
|
||||
@@ -74,7 +74,6 @@ where
|
||||
};
|
||||
|
||||
dstbuf.clear();
|
||||
dstbuf.reserve(len);
|
||||
|
||||
// Read the payload
|
||||
let mut remain = len;
|
||||
|
||||
@@ -260,9 +260,8 @@ impl Layer for DeltaLayer {
|
||||
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let mut cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
|
||||
let buf = cursor.read_blob(pos).with_context(|| {
|
||||
format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
@@ -611,9 +610,9 @@ impl DeltaLayer {
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
struct DeltaLayerWriterInner {
|
||||
pub struct DeltaLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
pub path: PathBuf,
|
||||
path: PathBuf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
|
||||
@@ -625,17 +624,17 @@ struct DeltaLayerWriterInner {
|
||||
blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
|
||||
}
|
||||
|
||||
impl DeltaLayerWriterInner {
|
||||
impl DeltaLayerWriter {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
fn new(
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
) -> anyhow::Result<Self> {
|
||||
) -> Result<DeltaLayerWriter> {
|
||||
// Create the file initially with a temporary filename. We don't know
|
||||
// the end key yet, so we cannot form the final filename yet. We will
|
||||
// rename it when we're done.
|
||||
@@ -654,7 +653,7 @@ impl DeltaLayerWriterInner {
|
||||
let block_buf = BlockBuf::new();
|
||||
let tree_builder = DiskBtreeBuilder::new(block_buf);
|
||||
|
||||
Ok(Self {
|
||||
Ok(DeltaLayerWriter {
|
||||
conf,
|
||||
path,
|
||||
timeline_id,
|
||||
@@ -671,17 +670,17 @@ impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
|
||||
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
|
||||
}
|
||||
|
||||
fn put_value_bytes(
|
||||
pub fn put_value_bytes(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: &[u8],
|
||||
will_init: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<()> {
|
||||
assert!(self.lsn_range.start <= lsn);
|
||||
|
||||
let off = self.blob_writer.write_blob(val)?;
|
||||
@@ -694,14 +693,14 @@ impl DeltaLayerWriterInner {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn size(&self) -> u64 {
|
||||
pub fn size(&self) -> u64 {
|
||||
self.blob_writer.size() + self.tree.borrow_writer().size()
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
pub fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -769,102 +768,6 @@ impl DeltaLayerWriterInner {
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new delta layer.
|
||||
///
|
||||
/// Usage:
|
||||
///
|
||||
/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
|
||||
///
|
||||
/// 2. Write the contents by calling `put_value` for every page
|
||||
/// version to store in the layer.
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
|
||||
/// possible for the writer to drop before `finish` is actually called. So this
|
||||
/// could lead to odd temporary files in the directory, exhausting file system.
|
||||
/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`
|
||||
/// implementation that cleans up the temporary file in failure. It's not
|
||||
/// possible to do this directly in `DeltaLayerWriterInner` since `finish` moves
|
||||
/// out some fields, making it impossible to implement `Drop`.
|
||||
///
|
||||
#[must_use]
|
||||
pub struct DeltaLayerWriter {
|
||||
inner: Option<DeltaLayerWriterInner>,
|
||||
}
|
||||
|
||||
impl DeltaLayerWriter {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
inner: Some(DeltaLayerWriterInner::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_start,
|
||||
lsn_range,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Append a key-value pair to the file.
|
||||
///
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_value(key, lsn, val)
|
||||
}
|
||||
|
||||
pub fn put_value_bytes(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: &[u8],
|
||||
will_init: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
self.inner
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.put_value_bytes(key, lsn, val, will_init)
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
self.inner.as_ref().unwrap().size()
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
self.inner.take().unwrap().finish(key_end)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for DeltaLayerWriter {
|
||||
fn drop(&mut self) {
|
||||
if let Some(inner) = self.inner.take() {
|
||||
match inner.blob_writer.into_inner().into_inner() {
|
||||
Ok(vfile) => vfile.remove(),
|
||||
Err(err) => warn!(
|
||||
"error while flushing buffer of image layer temporary file: {}",
|
||||
err
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator over all key-value pairse stored in a delta layer
|
||||
///
|
||||
|
||||
@@ -411,7 +411,7 @@ impl ImageLayer {
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
struct ImageLayerWriterInner {
|
||||
pub struct ImageLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
path: PathBuf,
|
||||
timeline_id: TimelineId,
|
||||
@@ -423,17 +423,14 @@ struct ImageLayerWriterInner {
|
||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||
}
|
||||
|
||||
impl ImageLayerWriterInner {
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
fn new(
|
||||
impl ImageLayerWriter {
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<Self> {
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
// Create the file initially with a temporary filename.
|
||||
// We'll atomically rename it to the final name when we're done.
|
||||
let path = ImageLayer::temp_path_for(
|
||||
@@ -458,7 +455,7 @@ impl ImageLayerWriterInner {
|
||||
let block_buf = BlockBuf::new();
|
||||
let tree_builder = DiskBtreeBuilder::new(block_buf);
|
||||
|
||||
let writer = Self {
|
||||
let writer = ImageLayerWriter {
|
||||
conf,
|
||||
path,
|
||||
timeline_id,
|
||||
@@ -477,7 +474,7 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
|
||||
ensure!(self.key_range.contains(&key));
|
||||
let off = self.blob_writer.write_blob(img)?;
|
||||
|
||||
@@ -488,10 +485,7 @@ impl ImageLayerWriterInner {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
fn finish(self) -> anyhow::Result<ImageLayer> {
|
||||
pub fn finish(self) -> anyhow::Result<ImageLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -558,76 +552,3 @@ impl ImageLayerWriterInner {
|
||||
Ok(layer)
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new image layer.
|
||||
///
|
||||
/// Usage:
|
||||
///
|
||||
/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
|
||||
///
|
||||
/// 2. Write the contents by calling `put_page_image` for every key-value
|
||||
/// pair in the key range.
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
|
||||
/// possible for the writer to drop before `finish` is actually called. So this
|
||||
/// could lead to odd temporary files in the directory, exhausting file system.
|
||||
/// This structure wraps `ImageLayerWriterInner` and also contains `Drop`
|
||||
/// implementation that cleans up the temporary file in failure. It's not
|
||||
/// possible to do this directly in `ImageLayerWriterInner` since `finish` moves
|
||||
/// out some fields, making it impossible to implement `Drop`.
|
||||
///
|
||||
#[must_use]
|
||||
pub struct ImageLayerWriter {
|
||||
inner: Option<ImageLayerWriterInner>,
|
||||
}
|
||||
|
||||
impl ImageLayerWriter {
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
Ok(Self {
|
||||
inner: Some(ImageLayerWriterInner::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_range,
|
||||
lsn,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Write next value to the file.
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_image(key, img)
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
|
||||
self.inner.take().unwrap().finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ImageLayerWriter {
|
||||
fn drop(&mut self) {
|
||||
if let Some(inner) = self.inner.take() {
|
||||
inner.blob_writer.into_inner().remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,475 +0,0 @@
|
||||
use std::cmp;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
use super::Tenant;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use tracing::*;
|
||||
|
||||
/// Inputs to the actual tenant sizing model
|
||||
///
|
||||
/// Implements [`serde::Serialize`] but is not meant to be part of the public API, instead meant to
|
||||
/// be a transferrable format between execution environments and developer.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(Debug, serde::Serialize, serde::Deserialize)]
|
||||
pub struct ModelInputs {
|
||||
updates: Vec<Update>,
|
||||
retention_period: u64,
|
||||
#[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
|
||||
timeline_inputs: HashMap<TimelineId, TimelineInputs>,
|
||||
}
|
||||
|
||||
/// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
|
||||
/// part of [`ModelInputs`] from the HTTP api, explaining the inputs.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(Debug, serde::Serialize, serde::Deserialize)]
|
||||
struct TimelineInputs {
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
last_record: Lsn,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
latest_gc_cutoff: Lsn,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
horizon_cutoff: Lsn,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pitr_cutoff: Lsn,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
next_gc_cutoff: Lsn,
|
||||
}
|
||||
|
||||
/// Gathers the inputs for the tenant sizing model.
|
||||
///
|
||||
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
|
||||
/// is updated on-demand, during the start of this calculation and separate from the
|
||||
/// [`Timeline::latest_gc_cutoff`].
|
||||
///
|
||||
/// For timelines in general:
|
||||
///
|
||||
/// ```ignore
|
||||
/// 0-----|---------|----|------------| · · · · · |·> lsn
|
||||
/// initdb_lsn branchpoints* next_gc_cutoff latest
|
||||
/// ```
|
||||
///
|
||||
/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
|
||||
/// tenant size will be zero.
|
||||
pub(super) async fn gather_inputs(
|
||||
tenant: &Tenant,
|
||||
limit: &Arc<Semaphore>,
|
||||
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
||||
) -> anyhow::Result<ModelInputs> {
|
||||
// with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
|
||||
// our advantage with `?` error handling.
|
||||
let mut joinset = tokio::task::JoinSet::new();
|
||||
|
||||
let timelines = tenant
|
||||
.refresh_gc_info()
|
||||
.context("Failed to refresh gc_info before gathering inputs")?;
|
||||
|
||||
if timelines.is_empty() {
|
||||
// All timelines are below tenant's gc_horizon; alternative would be to use
|
||||
// Tenant::list_timelines but then those gc_info's would not be updated yet, possibly
|
||||
// missing GcInfo::retain_lsns or having obsolete values for cutoff's.
|
||||
return Ok(ModelInputs {
|
||||
updates: vec![],
|
||||
retention_period: 0,
|
||||
timeline_inputs: HashMap::new(),
|
||||
});
|
||||
}
|
||||
|
||||
// record the used/inserted cache keys here, to remove extras not to start leaking
|
||||
// after initial run the cache should be quite stable, but live timelines will eventually
|
||||
// require new lsns to be inspected.
|
||||
let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new();
|
||||
|
||||
let mut updates = Vec::new();
|
||||
|
||||
// record the per timline values used to determine `retention_period`
|
||||
let mut timeline_inputs = HashMap::with_capacity(timelines.len());
|
||||
|
||||
// used to determine the `retention_period` for the size model
|
||||
let mut max_cutoff_distance = None;
|
||||
|
||||
// this will probably conflict with on-demand downloaded layers, or at least force them all
|
||||
// to be downloaded
|
||||
for timeline in timelines {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
|
||||
let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = {
|
||||
// there's a race between the update (holding tenant.gc_lock) and this read but it
|
||||
// might not be an issue, because it's not for Timeline::gc
|
||||
let gc_info = timeline.gc_info.read().unwrap();
|
||||
|
||||
// similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a
|
||||
// new gc run, which we have no control over. however differently from `Timeline::gc`
|
||||
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
|
||||
// actually removing files.
|
||||
let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
|
||||
|
||||
// the minimum where we should find the next_gc_cutoff for our calculations.
|
||||
//
|
||||
// next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
|
||||
// want to query any logical size before initdb_lsn.
|
||||
let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn);
|
||||
|
||||
let maybe_cutoff = if next_gc_cutoff > cutoff_minimum {
|
||||
Some((next_gc_cutoff, LsnKind::GcCutOff))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// this assumes there are no other lsns than the branchpoints
|
||||
let lsns = gc_info
|
||||
.retain_lsns
|
||||
.iter()
|
||||
.inspect(|&&lsn| {
|
||||
trace!(
|
||||
timeline_id=%timeline.timeline_id,
|
||||
"retained lsn: {lsn:?}, is_before_ancestor_lsn={}",
|
||||
lsn < timeline.get_ancestor_lsn()
|
||||
)
|
||||
})
|
||||
.filter(|&&lsn| lsn > timeline.get_ancestor_lsn())
|
||||
.copied()
|
||||
.map(|lsn| (lsn, LsnKind::BranchPoint))
|
||||
.chain(maybe_cutoff)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
(
|
||||
lsns,
|
||||
gc_info.horizon_cutoff,
|
||||
gc_info.pitr_cutoff,
|
||||
next_gc_cutoff,
|
||||
)
|
||||
};
|
||||
|
||||
// update this to have a retention_period later for the tenant_size_model
|
||||
// tenant_size_model compares this to the last segments start_lsn
|
||||
if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) {
|
||||
match max_cutoff_distance.as_mut() {
|
||||
Some(max) => {
|
||||
*max = std::cmp::max(*max, cutoff_distance);
|
||||
}
|
||||
_ => {
|
||||
max_cutoff_distance = Some(cutoff_distance);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// all timelines branch from something, because it might be impossible to pinpoint
|
||||
// which is the tenant_size_model's "default" branch.
|
||||
updates.push(Update {
|
||||
lsn: timeline.get_ancestor_lsn(),
|
||||
command: Command::BranchFrom(timeline.get_ancestor_timeline_id()),
|
||||
timeline_id: timeline.timeline_id,
|
||||
});
|
||||
|
||||
for (lsn, _kind) in &interesting_lsns {
|
||||
if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) {
|
||||
updates.push(Update {
|
||||
lsn: *lsn,
|
||||
timeline_id: timeline.timeline_id,
|
||||
command: Command::Update(*size),
|
||||
});
|
||||
|
||||
needed_cache.insert((timeline.timeline_id, *lsn));
|
||||
} else {
|
||||
let timeline = Arc::clone(&timeline);
|
||||
let parallel_size_calcs = Arc::clone(limit);
|
||||
joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn));
|
||||
}
|
||||
}
|
||||
|
||||
timeline_inputs.insert(
|
||||
timeline.timeline_id,
|
||||
TimelineInputs {
|
||||
last_record: last_record_lsn,
|
||||
// this is not used above, because it might not have updated recently enough
|
||||
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
|
||||
horizon_cutoff,
|
||||
pitr_cutoff,
|
||||
next_gc_cutoff,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
let mut have_any_error = false;
|
||||
|
||||
while let Some(res) = joinset.join_next().await {
|
||||
// each of these come with Result<Result<_, JoinError>, JoinError>
|
||||
// because of spawn + spawn_blocking
|
||||
let res = res.and_then(|inner| inner);
|
||||
match res {
|
||||
Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => {
|
||||
debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
|
||||
|
||||
logical_size_cache.insert((timeline.timeline_id, lsn), size);
|
||||
needed_cache.insert((timeline.timeline_id, lsn));
|
||||
|
||||
updates.push(Update {
|
||||
lsn,
|
||||
timeline_id: timeline.timeline_id,
|
||||
command: Command::Update(size),
|
||||
});
|
||||
}
|
||||
Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => {
|
||||
warn!(
|
||||
timeline_id=%timeline.timeline_id,
|
||||
"failed to calculate logical size at {lsn}: {error:#}"
|
||||
);
|
||||
have_any_error = true;
|
||||
}
|
||||
Err(join_error) if join_error.is_cancelled() => {
|
||||
unreachable!("we are not cancelling any of the futures, nor should be");
|
||||
}
|
||||
Err(join_error) => {
|
||||
// cannot really do anything, as this panic is likely a bug
|
||||
error!("logical size query panicked: {join_error:#}");
|
||||
have_any_error = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// prune any keys not needed anymore; we record every used key and added key.
|
||||
logical_size_cache.retain(|key, _| needed_cache.contains(key));
|
||||
|
||||
if have_any_error {
|
||||
// we cannot complete this round, because we are missing data.
|
||||
// we have however cached all we were able to request calculation on.
|
||||
anyhow::bail!("failed to calculate some logical_sizes");
|
||||
}
|
||||
|
||||
// the data gathered to updates is per lsn, regardless of the branch, so we can use it to
|
||||
// our advantage, not requiring a sorted container or graph walk.
|
||||
//
|
||||
// for branch points, which come as multiple updates at the same LSN, the Command::Update
|
||||
// is needed before a branch is made out of that branch Command::BranchFrom. this is
|
||||
// handled by the variant order in `Command`.
|
||||
updates.sort_unstable();
|
||||
|
||||
let retention_period = match max_cutoff_distance {
|
||||
Some(max) => max.0,
|
||||
None => {
|
||||
anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0")
|
||||
}
|
||||
};
|
||||
|
||||
Ok(ModelInputs {
|
||||
updates,
|
||||
retention_period,
|
||||
timeline_inputs,
|
||||
})
|
||||
}
|
||||
|
||||
impl ModelInputs {
|
||||
pub fn calculate(&self) -> anyhow::Result<u64> {
|
||||
// Option<TimelineId> is used for "naming" the branches because it is assumed to be
|
||||
// impossible to always determine the a one main branch.
|
||||
let mut storage = tenant_size_model::Storage::<Option<TimelineId>>::new(None);
|
||||
|
||||
// tracking these not to require modifying the current implementation of the size model,
|
||||
// which works in relative LSNs and sizes.
|
||||
let mut last_state: HashMap<TimelineId, (Lsn, u64)> = HashMap::new();
|
||||
|
||||
for update in &self.updates {
|
||||
let Update {
|
||||
lsn,
|
||||
command: op,
|
||||
timeline_id,
|
||||
} = update;
|
||||
match op {
|
||||
Command::Update(sz) => {
|
||||
let latest = last_state.get_mut(timeline_id).ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"ordering-mismatch: there must had been a previous state for {timeline_id}"
|
||||
)
|
||||
})?;
|
||||
|
||||
let lsn_bytes = {
|
||||
let Lsn(now) = lsn;
|
||||
let Lsn(prev) = latest.0;
|
||||
debug_assert!(prev <= *now, "self.updates should had been sorted");
|
||||
now - prev
|
||||
};
|
||||
|
||||
let size_diff =
|
||||
i64::try_from(*sz as i128 - latest.1 as i128).with_context(|| {
|
||||
format!("size difference i64 overflow for {timeline_id}")
|
||||
})?;
|
||||
|
||||
storage.modify_branch(&Some(*timeline_id), "".into(), lsn_bytes, size_diff);
|
||||
*latest = (*lsn, *sz);
|
||||
}
|
||||
Command::BranchFrom(parent) => {
|
||||
storage.branch(parent, Some(*timeline_id));
|
||||
|
||||
let size = parent
|
||||
.as_ref()
|
||||
.and_then(|id| last_state.get(id))
|
||||
.map(|x| x.1)
|
||||
.unwrap_or(0);
|
||||
last_state.insert(*timeline_id, (*lsn, size));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(storage.calculate(self.retention_period).total_children())
|
||||
}
|
||||
}
|
||||
|
||||
/// Single size model update.
|
||||
///
|
||||
/// Sizing model works with relative increments over latest branch state.
|
||||
/// Updates are absolute, so additional state needs to be tracked when applying.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(
|
||||
Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize,
|
||||
)]
|
||||
struct Update {
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
lsn: utils::lsn::Lsn,
|
||||
command: Command,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
timeline_id: TimelineId,
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
enum Command {
|
||||
Update(u64),
|
||||
BranchFrom(#[serde_as(as = "Option<serde_with::DisplayFromStr>")] Option<TimelineId>),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Command {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// custom one-line implementation makes it more enjoyable to read {:#?} avoiding 3
|
||||
// linebreaks
|
||||
match self {
|
||||
Self::Update(arg0) => write!(f, "Update({arg0})"),
|
||||
Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
enum LsnKind {
|
||||
BranchPoint,
|
||||
GcCutOff,
|
||||
}
|
||||
|
||||
/// Newtype around the tuple that carries the timeline at lsn logical size calculation.
|
||||
struct TimelineAtLsnSizeResult(
|
||||
Arc<crate::tenant::Timeline>,
|
||||
utils::lsn::Lsn,
|
||||
anyhow::Result<u64>,
|
||||
);
|
||||
|
||||
#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
|
||||
async fn calculate_logical_size(
|
||||
limit: Arc<tokio::sync::Semaphore>,
|
||||
timeline: Arc<crate::tenant::Timeline>,
|
||||
lsn: utils::lsn::Lsn,
|
||||
) -> Result<TimelineAtLsnSizeResult, tokio::task::JoinError> {
|
||||
let permit = tokio::sync::Semaphore::acquire_owned(limit)
|
||||
.await
|
||||
.expect("global semaphore should not had been closed");
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _permit = permit;
|
||||
let size_res = timeline.calculate_logical_size(lsn);
|
||||
TimelineAtLsnSizeResult(timeline, lsn, size_res)
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn updates_sort() {
|
||||
use std::str::FromStr;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
let ids = [
|
||||
TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(),
|
||||
TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(),
|
||||
TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(),
|
||||
];
|
||||
|
||||
// try through all permutations
|
||||
let ids = [
|
||||
[&ids[0], &ids[1], &ids[2]],
|
||||
[&ids[0], &ids[2], &ids[1]],
|
||||
[&ids[1], &ids[0], &ids[2]],
|
||||
[&ids[1], &ids[2], &ids[0]],
|
||||
[&ids[2], &ids[0], &ids[1]],
|
||||
[&ids[2], &ids[1], &ids[0]],
|
||||
];
|
||||
|
||||
for ids in ids {
|
||||
// apply a fixture which uses a permutation of ids
|
||||
let commands = [
|
||||
Update {
|
||||
lsn: Lsn(0),
|
||||
command: Command::BranchFrom(None),
|
||||
timeline_id: *ids[0],
|
||||
},
|
||||
Update {
|
||||
lsn: Lsn::from_str("0/67E7618").unwrap(),
|
||||
command: Command::Update(43696128),
|
||||
timeline_id: *ids[0],
|
||||
},
|
||||
Update {
|
||||
lsn: Lsn::from_str("0/67E7618").unwrap(),
|
||||
command: Command::BranchFrom(Some(*ids[0])),
|
||||
timeline_id: *ids[1],
|
||||
},
|
||||
Update {
|
||||
lsn: Lsn::from_str("0/76BE4F0").unwrap(),
|
||||
command: Command::Update(41844736),
|
||||
timeline_id: *ids[1],
|
||||
},
|
||||
Update {
|
||||
lsn: Lsn::from_str("0/10E49380").unwrap(),
|
||||
command: Command::Update(42164224),
|
||||
timeline_id: *ids[0],
|
||||
},
|
||||
Update {
|
||||
lsn: Lsn::from_str("0/10E49380").unwrap(),
|
||||
command: Command::BranchFrom(Some(*ids[0])),
|
||||
timeline_id: *ids[2],
|
||||
},
|
||||
Update {
|
||||
lsn: Lsn::from_str("0/11D74910").unwrap(),
|
||||
command: Command::Update(42172416),
|
||||
timeline_id: *ids[2],
|
||||
},
|
||||
Update {
|
||||
lsn: Lsn::from_str("0/12051E98").unwrap(),
|
||||
command: Command::Update(42196992),
|
||||
timeline_id: *ids[0],
|
||||
},
|
||||
];
|
||||
|
||||
let mut sorted = commands;
|
||||
|
||||
// these must sort in the same order, regardless of how the ids sort
|
||||
// which is why the timeline_id is the last field
|
||||
sorted.sort_unstable();
|
||||
|
||||
assert_eq!(commands, sorted, "{:#?} vs. {:#?}", commands, sorted);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn verify_size_for_multiple_branches() {
|
||||
// this is generated from integration test test_tenant_size_with_multiple_branches, but this way
|
||||
// it has the stable lsn's
|
||||
let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#;
|
||||
|
||||
let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
|
||||
|
||||
assert_eq!(inputs.calculate().unwrap(), 36_409_872);
|
||||
}
|
||||
@@ -16,7 +16,7 @@ use std::fs;
|
||||
use std::ops::{Deref, Range};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
|
||||
use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use crate::tenant::{
|
||||
@@ -121,16 +121,8 @@ pub struct Timeline {
|
||||
/// to avoid deadlock.
|
||||
write_lock: Mutex<()>,
|
||||
|
||||
/// Used to avoid multiple `flush_loop` tasks running
|
||||
flush_loop_started: Mutex<bool>,
|
||||
|
||||
/// layer_flush_start_tx can be used to wake up the layer-flushing task.
|
||||
/// The value is a counter, incremented every time a new flush cycle is requested.
|
||||
/// The flush cycle counter is sent back on the layer_flush_done channel when
|
||||
/// the flush finishes. You can use that to wait for the flush to finish.
|
||||
layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
|
||||
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
|
||||
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,
|
||||
/// Used to ensure that there is only task performing flushing at a time
|
||||
layer_flush_lock: Mutex<()>,
|
||||
|
||||
/// Layer removal lock.
|
||||
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
|
||||
@@ -280,11 +272,6 @@ impl LogicalSize {
|
||||
self.size_added_after_initial
|
||||
.fetch_add(delta, AtomicOrdering::SeqCst);
|
||||
}
|
||||
|
||||
/// Returns the initialized (already calculated) value, if any.
|
||||
fn initialized_size(&self) -> Option<u64> {
|
||||
self.initial_logical_size.get().copied()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WalReceiverInfo {
|
||||
@@ -474,16 +461,15 @@ impl Timeline {
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
|
||||
/// know anything about them here in the repository.
|
||||
#[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))]
|
||||
pub async fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
|
||||
pub fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
|
||||
match cconf {
|
||||
CheckpointConfig::Flush => {
|
||||
self.freeze_inmem_layer(false);
|
||||
self.flush_frozen_layers_and_wait().await
|
||||
self.flush_frozen_layers(true)
|
||||
}
|
||||
CheckpointConfig::Forced => {
|
||||
self.freeze_inmem_layer(false);
|
||||
self.flush_frozen_layers_and_wait().await?;
|
||||
self.flush_frozen_layers(true)?;
|
||||
self.compact()
|
||||
}
|
||||
}
|
||||
@@ -633,8 +619,24 @@ impl Timeline {
|
||||
self.last_freeze_at.store(last_lsn);
|
||||
*(self.last_freeze_ts.write().unwrap()) = Instant::now();
|
||||
|
||||
// Wake up the layer flusher
|
||||
self.flush_frozen_layers();
|
||||
// Launch a task to flush the frozen layer to disk, unless
|
||||
// a task was already running. (If the task was running
|
||||
// at the time that we froze the layer, it must've seen the
|
||||
// the layer we just froze before it exited; see comments
|
||||
// in flush_frozen_layers())
|
||||
if let Ok(guard) = self.layer_flush_lock.try_lock() {
|
||||
drop(guard);
|
||||
let self_clone = Arc::clone(self);
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::LayerFlushTask,
|
||||
Some(self.tenant_id),
|
||||
Some(self.timeline_id),
|
||||
"layer flush task",
|
||||
false,
|
||||
async move { self_clone.flush_frozen_layers(false) },
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@@ -725,9 +727,6 @@ impl Timeline {
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
let (state, _) = watch::channel(TimelineState::Suspended);
|
||||
|
||||
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
|
||||
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
|
||||
|
||||
let mut result = Timeline {
|
||||
conf,
|
||||
tenant_conf,
|
||||
@@ -755,12 +754,8 @@ impl Timeline {
|
||||
|
||||
upload_layers: AtomicBool::new(upload_layers),
|
||||
|
||||
flush_loop_started: Mutex::new(false),
|
||||
|
||||
layer_flush_start_tx,
|
||||
layer_flush_done_tx,
|
||||
|
||||
write_lock: Mutex::new(()),
|
||||
layer_flush_lock: Mutex::new(()),
|
||||
layer_removal_cs: Mutex::new(()),
|
||||
|
||||
gc_info: RwLock::new(GcInfo {
|
||||
@@ -793,33 +788,6 @@ impl Timeline {
|
||||
result
|
||||
}
|
||||
|
||||
pub(super) fn maybe_spawn_flush_loop(self: &Arc<Self>) {
|
||||
let mut flush_loop_started = self.flush_loop_started.lock().unwrap();
|
||||
if *flush_loop_started {
|
||||
info!(
|
||||
"skipping attempt to start flush_loop twice {}/{}",
|
||||
self.tenant_id, self.timeline_id
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
|
||||
let self_clone = Arc::clone(self);
|
||||
info!("spawning flush loop");
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::LayerFlushTask,
|
||||
Some(self.tenant_id),
|
||||
Some(self.timeline_id),
|
||||
"layer flush task",
|
||||
false,
|
||||
async move { self_clone.flush_loop(layer_flush_start_rx).await; Ok(()) }
|
||||
.instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id))
|
||||
);
|
||||
|
||||
*flush_loop_started = true;
|
||||
}
|
||||
|
||||
pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
|
||||
if !is_etcd_client_initialized() {
|
||||
if cfg!(test) {
|
||||
@@ -1011,26 +979,9 @@ impl Timeline {
|
||||
/// Calculate the logical size of the database at the latest LSN.
|
||||
///
|
||||
/// NOTE: counted incrementally, includes ancestors, this can be a slow operation.
|
||||
pub fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result<u64> {
|
||||
info!(
|
||||
"Calculating logical size for timeline {} at {}",
|
||||
self.timeline_id, up_to_lsn
|
||||
);
|
||||
let timer = if up_to_lsn == self.initdb_lsn {
|
||||
if let Some(size) = self.current_logical_size.initialized_size() {
|
||||
if size != 0 {
|
||||
// non-zero size means that the size has already been calculated by this method
|
||||
// after startup. if the logical size is for a new timeline without layers the
|
||||
// size will be zero, and we cannot use that, or this caching strategy until
|
||||
// pageserver restart.
|
||||
return Ok(size);
|
||||
}
|
||||
}
|
||||
|
||||
self.metrics.init_logical_size_histo.start_timer()
|
||||
} else {
|
||||
self.metrics.logical_size_histo.start_timer()
|
||||
};
|
||||
fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result<u64> {
|
||||
info!("Calculating logical size for timeline {}", self.timeline_id);
|
||||
let timer = self.metrics.init_logical_size_histo.start_timer();
|
||||
let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?;
|
||||
debug!("calculated logical size: {logical_size}");
|
||||
timer.stop_and_record();
|
||||
@@ -1316,94 +1267,53 @@ impl Timeline {
|
||||
drop(layers);
|
||||
}
|
||||
|
||||
/// Layer flusher task's main loop.
|
||||
async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>) {
|
||||
info!("started flush loop");
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
info!("shutting down layer flush task");
|
||||
break;
|
||||
},
|
||||
_ = layer_flush_start_rx.changed() => {}
|
||||
/// Flush all frozen layers to disk.
|
||||
///
|
||||
/// Only one task at a time can be doing layer-flushing for a
|
||||
/// given timeline. If 'wait' is true, and another task is
|
||||
/// currently doing the flushing, this function will wait for it
|
||||
/// to finish. If 'wait' is false, this function will return
|
||||
/// immediately instead.
|
||||
fn flush_frozen_layers(&self, wait: bool) -> anyhow::Result<()> {
|
||||
let flush_lock_guard = if wait {
|
||||
self.layer_flush_lock.lock().unwrap()
|
||||
} else {
|
||||
match self.layer_flush_lock.try_lock() {
|
||||
Ok(guard) => guard,
|
||||
Err(TryLockError::WouldBlock) => return Ok(()),
|
||||
Err(TryLockError::Poisoned(err)) => panic!("{:?}", err),
|
||||
}
|
||||
};
|
||||
|
||||
trace!("waking up");
|
||||
let timer = self.metrics.flush_time_histo.start_timer();
|
||||
let flush_counter = *layer_flush_start_rx.borrow();
|
||||
let result = loop {
|
||||
let layer_to_flush = {
|
||||
let layers = self.layers.read().unwrap();
|
||||
layers.frozen_layers.front().cloned()
|
||||
// drop 'layers' lock to allow concurrent reads and writes
|
||||
};
|
||||
if let Some(layer_to_flush) = layer_to_flush {
|
||||
if let Err(err) = self.flush_frozen_layer(layer_to_flush).await {
|
||||
error!("could not flush frozen layer: {err:?}");
|
||||
break Err(err);
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
break Ok(());
|
||||
}
|
||||
};
|
||||
// Notify any listeners that we're done
|
||||
let _ = self
|
||||
.layer_flush_done_tx
|
||||
.send_replace((flush_counter, result));
|
||||
|
||||
timer.stop_and_record();
|
||||
}
|
||||
}
|
||||
|
||||
async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> {
|
||||
let mut rx = self.layer_flush_done_tx.subscribe();
|
||||
|
||||
// Increment the flush cycle counter and wake up the flush task.
|
||||
// Remember the new value, so that when we listen for the flush
|
||||
// to finish, we know when the flush that we initiated has
|
||||
// finished, instead of some other flush that was started earlier.
|
||||
let mut my_flush_request = 0;
|
||||
|
||||
if !&*self.flush_loop_started.lock().unwrap() {
|
||||
anyhow::bail!("cannot flush frozen layers when flush_loop is not running")
|
||||
}
|
||||
|
||||
self.layer_flush_start_tx.send_modify(|counter| {
|
||||
my_flush_request = *counter + 1;
|
||||
*counter = my_flush_request;
|
||||
});
|
||||
let timer = self.metrics.flush_time_histo.start_timer();
|
||||
|
||||
loop {
|
||||
{
|
||||
let (last_result_counter, last_result) = &*rx.borrow();
|
||||
if *last_result_counter >= my_flush_request {
|
||||
if let Err(_err) = last_result {
|
||||
// We already logged the original error in
|
||||
// flush_loop. We cannot propagate it to the caller
|
||||
// here, because it might not be Cloneable
|
||||
anyhow::bail!(
|
||||
"Could not flush frozen layer. Request id: {}",
|
||||
my_flush_request
|
||||
);
|
||||
} else {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
let layers = self.layers.read().unwrap();
|
||||
if let Some(frozen_layer) = layers.frozen_layers.front() {
|
||||
let frozen_layer = Arc::clone(frozen_layer);
|
||||
drop(layers); // to allow concurrent reads and writes
|
||||
self.flush_frozen_layer(frozen_layer)?;
|
||||
} else {
|
||||
// Drop the 'layer_flush_lock' *before* 'layers'. That
|
||||
// way, if you freeze a layer, and then call
|
||||
// flush_frozen_layers(false), it is guaranteed that
|
||||
// if another thread was busy flushing layers and the
|
||||
// call therefore returns immediately, the other
|
||||
// thread will have seen the newly-frozen layer and
|
||||
// will flush that too (assuming no errors).
|
||||
drop(flush_lock_guard);
|
||||
drop(layers);
|
||||
break;
|
||||
}
|
||||
trace!("waiting for flush to complete");
|
||||
rx.changed().await?;
|
||||
trace!("done")
|
||||
}
|
||||
}
|
||||
|
||||
fn flush_frozen_layers(&self) {
|
||||
self.layer_flush_start_tx.send_modify(|val| *val += 1);
|
||||
timer.stop_and_record();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Flush one frozen in-memory layer to disk, as a new delta layer.
|
||||
#[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.filename().display()))]
|
||||
async fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
|
||||
fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
|
||||
// As a special case, when we have just imported an image into the repository,
|
||||
// instead of writing out a L0 delta layer, we directly write out image layer
|
||||
// files instead. This is possible as long as *all* the data imported into the
|
||||
@@ -1631,10 +1541,6 @@ impl Timeline {
|
||||
lsn,
|
||||
)?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
anyhow::bail!("failpoint image-layer-writer-fail-before-finish");
|
||||
});
|
||||
|
||||
for range in &partition.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
@@ -1929,11 +1835,6 @@ impl Timeline {
|
||||
},
|
||||
)?);
|
||||
}
|
||||
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
anyhow::bail!("failpoint delta-layer-writer-fail-before-finish");
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||
prev_key = Some(key);
|
||||
}
|
||||
@@ -2333,10 +2234,13 @@ impl Timeline {
|
||||
|
||||
let last_rec_lsn = data.records.last().unwrap().0;
|
||||
|
||||
let img = self
|
||||
.walredo_mgr
|
||||
.request_redo(key, request_lsn, base_img, data.records, self.pg_version)
|
||||
.context("Failed to reconstruct a page image:")?;
|
||||
let img = self.walredo_mgr.request_redo(
|
||||
key,
|
||||
request_lsn,
|
||||
base_img,
|
||||
data.records,
|
||||
self.pg_version,
|
||||
)?;
|
||||
|
||||
if img.len() == page_cache::PAGE_SZ {
|
||||
let cache = page_cache::get();
|
||||
|
||||
@@ -82,7 +82,6 @@ pub struct TenantConf {
|
||||
/// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
|
||||
/// to avoid eager reconnects.
|
||||
pub max_lsn_wal_lag: NonZeroU64,
|
||||
pub trace_read_requests: bool,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
@@ -106,7 +105,6 @@ pub struct TenantConfOpt {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub lagging_wal_timeout: Option<Duration>,
|
||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||
pub trace_read_requests: Option<bool>,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -140,9 +138,6 @@ impl TenantConfOpt {
|
||||
.lagging_wal_timeout
|
||||
.unwrap_or(global_conf.lagging_wal_timeout),
|
||||
max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
|
||||
trace_read_requests: self
|
||||
.trace_read_requests
|
||||
.unwrap_or(global_conf.trace_read_requests),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -212,7 +207,6 @@ impl TenantConf {
|
||||
.expect("cannot parse default walreceiver lagging wal timeout"),
|
||||
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
|
||||
.expect("cannot parse default max walreceiver Lsn wal lag"),
|
||||
trace_read_requests: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -238,7 +232,6 @@ impl TenantConf {
|
||||
.unwrap(),
|
||||
max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
|
||||
.unwrap(),
|
||||
trace_read_requests: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -241,7 +241,7 @@ pub async fn shutdown_all_tenants() {
|
||||
let tenant_id = tenant.tenant_id();
|
||||
debug!("shutdown tenant {tenant_id}");
|
||||
|
||||
if let Err(err) = tenant.checkpoint().await {
|
||||
if let Err(err) = tenant.checkpoint() {
|
||||
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -119,7 +119,7 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
let gc_horizon = tenant.get_gc_horizon();
|
||||
let mut sleep_duration = gc_period;
|
||||
if gc_horizon > 0 {
|
||||
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
|
||||
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false)
|
||||
{
|
||||
sleep_duration = wait_duration;
|
||||
error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
|
||||
|
||||
@@ -1,36 +0,0 @@
|
||||
use bytes::Bytes;
|
||||
use std::{
|
||||
fs::{create_dir_all, File},
|
||||
io::{BufWriter, Write},
|
||||
path::PathBuf,
|
||||
};
|
||||
|
||||
pub struct Tracer {
|
||||
writer: BufWriter<File>,
|
||||
}
|
||||
|
||||
impl Drop for Tracer {
|
||||
fn drop(&mut self) {
|
||||
self.flush()
|
||||
}
|
||||
}
|
||||
|
||||
impl Tracer {
|
||||
pub fn new(path: PathBuf) -> Self {
|
||||
let parent = path.parent().expect("failed to parse parent path");
|
||||
create_dir_all(parent).expect("failed to create trace dir");
|
||||
|
||||
let file = File::create(path).expect("failed to create trace file");
|
||||
Tracer {
|
||||
writer: BufWriter::new(file),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn trace(&mut self, msg: &Bytes) {
|
||||
self.writer.write_all(msg).expect("failed to write trace");
|
||||
}
|
||||
|
||||
pub fn flush(&mut self) {
|
||||
self.writer.flush().expect("failed to flush trace file");
|
||||
}
|
||||
}
|
||||
@@ -319,12 +319,6 @@ impl VirtualFile {
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn remove(self) {
|
||||
let path = self.path.clone();
|
||||
drop(self);
|
||||
std::fs::remove_file(path).expect("failed to remove the virtual file");
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for VirtualFile {
|
||||
|
||||
@@ -155,19 +155,22 @@ impl<E: Clone> TaskHandle<E> {
|
||||
|
||||
/// Aborts current task, waiting for it to finish.
|
||||
pub async fn shutdown(self) {
|
||||
if let Some(jh) = self.join_handle {
|
||||
self.cancellation.send(()).ok();
|
||||
match jh.await {
|
||||
Ok(Ok(())) => debug!("Shutdown success"),
|
||||
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
|
||||
Err(join_error) => {
|
||||
if join_error.is_cancelled() {
|
||||
error!("Shutdown task was cancelled");
|
||||
} else {
|
||||
error!("Shutdown task join error: {join_error}")
|
||||
match self.join_handle {
|
||||
Some(jh) => {
|
||||
self.cancellation.send(()).ok();
|
||||
match jh.await {
|
||||
Ok(Ok(())) => debug!("Shutdown success"),
|
||||
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
|
||||
Err(join_error) => {
|
||||
if join_error.is_cancelled() {
|
||||
error!("Shutdown task was cancelled");
|
||||
} else {
|
||||
error!("Shutdown task join error: {join_error}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,7 +93,7 @@ pub fn spawn_connection_manager_task(
|
||||
}
|
||||
}
|
||||
.instrument(
|
||||
info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
|
||||
info_span!("wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
|
||||
),
|
||||
);
|
||||
}
|
||||
@@ -836,20 +836,15 @@ fn wal_stream_connection_string(
|
||||
listen_pg_addr_str: &str,
|
||||
) -> anyhow::Result<String> {
|
||||
let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db");
|
||||
sk_connstr
|
||||
.parse()
|
||||
.context("bad url")
|
||||
.and_then(|url: url::Url| {
|
||||
let host = url.host_str().context("host is missing")?;
|
||||
let port = url.port().unwrap_or(5432); // default PG port
|
||||
|
||||
Ok(format!(
|
||||
"host={host} \
|
||||
port={port} \
|
||||
options='-c timeline_id={timeline_id} tenant_id={tenant_id}'"
|
||||
))
|
||||
})
|
||||
.with_context(|| format!("Failed to parse pageserver connection URL '{sk_connstr}'"))
|
||||
let me_conf = sk_connstr
|
||||
.parse::<postgres::config::Config>()
|
||||
.with_context(|| {
|
||||
format!("Failed to parse pageserver connection string '{sk_connstr}' as a postgres one")
|
||||
})?;
|
||||
let (host, port) = utils::connstring::connection_host_port(&me_conf);
|
||||
Ok(format!(
|
||||
"host={host} port={port} options='-c timeline_id={timeline_id} tenant_id={tenant_id}'"
|
||||
))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -897,7 +892,7 @@ mod tests {
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("no_commit_lsn".to_string()),
|
||||
safekeeper_connstr: Some("no commit_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
@@ -914,7 +909,7 @@ mod tests {
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
safekeeper_connstr: Some("no_commit_lsn".to_string()),
|
||||
safekeeper_connstr: Some("no commit_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
@@ -1010,7 +1005,7 @@ mod tests {
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("not_advanced_lsn".to_string()),
|
||||
safekeeper_connstr: Some("not advanced Lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
@@ -1028,7 +1023,7 @@ mod tests {
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("not_enough_advanced_lsn".to_string()),
|
||||
safekeeper_connstr: Some("not enough advanced Lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
@@ -1098,7 +1093,7 @@ mod tests {
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("smaller_commit_lsn".to_string()),
|
||||
safekeeper_connstr: Some("smaller commit_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
@@ -1288,7 +1283,7 @@ mod tests {
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("advanced_by_lsn_safekeeper".to_string()),
|
||||
safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
@@ -1312,7 +1307,7 @@ mod tests {
|
||||
);
|
||||
assert!(over_threshcurrent_candidate
|
||||
.wal_source_connstr
|
||||
.contains("advanced_by_lsn_safekeeper"));
|
||||
.contains("advanced by Lsn safekeeper"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -31,8 +31,8 @@ use crate::{
|
||||
walrecord::DecodedWALRecord,
|
||||
};
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use pq_proto::ReplicationFeedback;
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::{lsn::Lsn, pq_proto::ReplicationFeedback};
|
||||
|
||||
/// Status of the connection.
|
||||
#[derive(Debug, Clone)]
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
//! process. Then we get the page image back. Communication with the
|
||||
//! postgres process happens via stdin/stdout
|
||||
//!
|
||||
//! See pgxn/neon_walredo/walredoproc.c for the other side of
|
||||
//! See src/backend/tcop/zenith_wal_redo.c for the other side of
|
||||
//! this communication.
|
||||
//!
|
||||
//! The Postgres process is assumed to be secure against malicious WAL
|
||||
@@ -22,10 +22,10 @@ use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use nix::poll::*;
|
||||
use serde::Serialize;
|
||||
use std::fs;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::prelude::*;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::unix::io::AsRawFd;
|
||||
use std::os::unix::prelude::CommandExt;
|
||||
use std::path::PathBuf;
|
||||
@@ -34,7 +34,6 @@ use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use std::{fs, io};
|
||||
use tracing::*;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
|
||||
@@ -45,7 +44,6 @@ use crate::metrics::{
|
||||
};
|
||||
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
|
||||
use crate::repository::Key;
|
||||
use crate::task_mgr::BACKGROUND_RUNTIME;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{config::PageServerConf, TEMP_FILE_SUFFIX};
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
@@ -231,7 +229,7 @@ impl PostgresRedoManager {
|
||||
|
||||
// launch the WAL redo process on first use
|
||||
if process_guard.is_none() {
|
||||
let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
|
||||
let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?;
|
||||
*process_guard = Some(p);
|
||||
}
|
||||
let process = process_guard.as_mut().unwrap();
|
||||
@@ -581,8 +579,7 @@ impl<C: CommandExt> CloseFileDescriptors for C {
|
||||
/// Handle to the Postgres WAL redo process
|
||||
///
|
||||
struct PostgresRedoProcess {
|
||||
tenant_id: TenantId,
|
||||
child: NoLeakChild,
|
||||
child: Child,
|
||||
stdin: ChildStdin,
|
||||
stdout: ChildStdout,
|
||||
stderr: ChildStderr,
|
||||
@@ -592,17 +589,16 @@ impl PostgresRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
|
||||
fn launch(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: &TenantId,
|
||||
pg_version: u32,
|
||||
) -> Result<PostgresRedoProcess, Error> {
|
||||
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
|
||||
// just create one with constant name. That fails if you try to launch more than
|
||||
// one WAL redo manager concurrently.
|
||||
let datadir = path_with_suffix_extension(
|
||||
conf.tenant_path(&tenant_id).join("wal-redo-datadir"),
|
||||
conf.tenant_path(tenant_id).join("wal-redo-datadir"),
|
||||
TEMP_FILE_SUFFIX,
|
||||
);
|
||||
|
||||
@@ -648,16 +644,18 @@ impl PostgresRedoProcess {
|
||||
),
|
||||
));
|
||||
} else {
|
||||
// Limit shared cache for wal-redo-postgres
|
||||
// Limit shared cache for wal-redo-postres
|
||||
let mut config = OpenOptions::new()
|
||||
.append(true)
|
||||
.open(PathBuf::from(&datadir).join("postgresql.conf"))?;
|
||||
config.write_all(b"shared_buffers=128kB\n")?;
|
||||
config.write_all(b"fsync=off\n")?;
|
||||
config.write_all(b"shared_preload_libraries=neon\n")?;
|
||||
config.write_all(b"neon.wal_redo=on\n")?;
|
||||
}
|
||||
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
let mut child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
.arg("--wal-redo")
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
@@ -666,17 +664,20 @@ impl PostgresRedoProcess {
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("PGDATA", &datadir)
|
||||
// The redo process is not trusted, and runs in seccomp mode that
|
||||
// doesn't allow it to open any files. We have to also make sure it
|
||||
// doesn't inherit any file descriptors from the pageserver, that
|
||||
// would allow an attacker to read any files that happen to be open
|
||||
// in the pageserver.
|
||||
// The redo process is not trusted, so it runs in seccomp mode
|
||||
// (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't
|
||||
// inherit any file descriptors from the pageserver that would allow
|
||||
// an attacker to do bad things.
|
||||
//
|
||||
// The Rust standard library makes sure to mark any file descriptors with
|
||||
// as close-on-exec by default, but that's not enough, since we use
|
||||
// libraries that directly call libc open without setting that flag.
|
||||
//
|
||||
// One example is the pidfile of the daemonize library, which doesn't
|
||||
// currently mark file descriptors as close-on-exec. Either way, we
|
||||
// want to be on the safe side and prevent accidental regression.
|
||||
.close_fds()
|
||||
.spawn_no_leak_child()
|
||||
.spawn()
|
||||
.map_err(|e| {
|
||||
Error::new(
|
||||
e.kind(),
|
||||
@@ -684,33 +685,20 @@ impl PostgresRedoProcess {
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut child = scopeguard::guard(child, |child| {
|
||||
error!("killing wal-redo-postgres process due to a problem during launch");
|
||||
child.kill_and_wait();
|
||||
});
|
||||
info!(
|
||||
"launched WAL redo postgres process on {}",
|
||||
datadir.display()
|
||||
);
|
||||
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
|
||||
macro_rules! set_nonblock_or_log_err {
|
||||
($file:ident) => {{
|
||||
let res = set_nonblock($file.as_raw_fd());
|
||||
if let Err(e) = &res {
|
||||
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
|
||||
}
|
||||
res
|
||||
}};
|
||||
}
|
||||
set_nonblock_or_log_err!(stdin)?;
|
||||
set_nonblock_or_log_err!(stdout)?;
|
||||
set_nonblock_or_log_err!(stderr)?;
|
||||
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
set_nonblock(stdin.as_raw_fd())?;
|
||||
set_nonblock(stdout.as_raw_fd())?;
|
||||
set_nonblock(stderr.as_raw_fd())?;
|
||||
|
||||
Ok(PostgresRedoProcess {
|
||||
tenant_id,
|
||||
child,
|
||||
stdin,
|
||||
stdout,
|
||||
@@ -718,16 +706,18 @@ impl PostgresRedoProcess {
|
||||
})
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
|
||||
fn kill(self) {
|
||||
self.child.kill_and_wait();
|
||||
fn kill(mut self) {
|
||||
let _ = self.child.kill();
|
||||
if let Ok(exit_status) = self.child.wait() {
|
||||
error!("wal-redo-postgres exited with code {}", exit_status);
|
||||
}
|
||||
drop(self);
|
||||
}
|
||||
|
||||
//
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
|
||||
fn apply_wal_records(
|
||||
&mut self,
|
||||
tag: BufferTag,
|
||||
@@ -740,11 +730,7 @@ impl PostgresRedoProcess {
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
//
|
||||
// Most requests start with a before-image with BLCKSZ bytes, followed by
|
||||
// by some other WAL records. Start with a buffer that can hold that
|
||||
// comfortably.
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
|
||||
let mut writebuf: Vec<u8> = Vec::new();
|
||||
build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
build_push_page_msg(tag, &img, &mut writebuf);
|
||||
@@ -857,101 +843,8 @@ impl PostgresRedoProcess {
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper type around `std::process::Child` which guarantees that the child
|
||||
/// will be killed and waited-for by this process before being dropped.
|
||||
struct NoLeakChild {
|
||||
child: Option<Child>,
|
||||
}
|
||||
|
||||
impl Deref for NoLeakChild {
|
||||
type Target = Child;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.child.as_ref().expect("must not use from drop")
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for NoLeakChild {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.child.as_mut().expect("must not use from drop")
|
||||
}
|
||||
}
|
||||
|
||||
impl NoLeakChild {
|
||||
fn spawn(command: &mut Command) -> io::Result<Self> {
|
||||
let child = command.spawn()?;
|
||||
Ok(NoLeakChild { child: Some(child) })
|
||||
}
|
||||
|
||||
fn kill_and_wait(mut self) {
|
||||
let child = match self.child.take() {
|
||||
Some(child) => child,
|
||||
None => return,
|
||||
};
|
||||
Self::kill_and_wait_impl(child);
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(pid=child.id()))]
|
||||
fn kill_and_wait_impl(mut child: Child) {
|
||||
let res = child.kill();
|
||||
if let Err(e) = res {
|
||||
// This branch is very unlikely because:
|
||||
// - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
|
||||
// - This is the only place that calls .kill()
|
||||
// - We consume `self`, so, .kill() can't be called twice.
|
||||
// - If the process exited by itself or was killed by someone else,
|
||||
// .kill() will still succeed because we haven't wait()'ed yet.
|
||||
//
|
||||
// So, if we arrive here, we have really no idea what happened,
|
||||
// whether the PID stored in self.child is still valid, etc.
|
||||
// If this function were fallible, we'd return an error, but
|
||||
// since it isn't, all we can do is log an error and proceed
|
||||
// with the wait().
|
||||
error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
|
||||
}
|
||||
|
||||
match child.wait() {
|
||||
Ok(exit_status) => {
|
||||
// log at error level since .kill() is something we only do on errors ATM
|
||||
error!(exit_status = %exit_status, "wait successful");
|
||||
}
|
||||
Err(e) => {
|
||||
error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for NoLeakChild {
|
||||
fn drop(&mut self) {
|
||||
let child = match self.child.take() {
|
||||
Some(child) => child,
|
||||
None => return,
|
||||
};
|
||||
// Offload the kill+wait of the child process into the background.
|
||||
// If someone stops the runtime, we'll leak the child process.
|
||||
// We can ignore that case because we only stop the runtime on pageserver exit.
|
||||
BACKGROUND_RUNTIME.spawn(async move {
|
||||
tokio::task::spawn_blocking(move || {
|
||||
Self::kill_and_wait_impl(child);
|
||||
})
|
||||
.await
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
trait NoLeakChildCommandExt {
|
||||
fn spawn_no_leak_child(&mut self) -> io::Result<NoLeakChild>;
|
||||
}
|
||||
|
||||
impl NoLeakChildCommandExt for Command {
|
||||
fn spawn_no_leak_child(&mut self) -> io::Result<NoLeakChild> {
|
||||
NoLeakChild::spawn(self)
|
||||
}
|
||||
}
|
||||
|
||||
// Functions for constructing messages to send to the postgres WAL redo
|
||||
// process. See pgxn/neon_walredo/walredoproc.c for
|
||||
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
|
||||
// explanation of the protocol.
|
||||
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
MODULE_big = neon
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
inmem_smgr.o \
|
||||
libpagestore.o \
|
||||
libpqwalproposer.o \
|
||||
pagestore_smgr.o \
|
||||
|
||||
@@ -3,8 +3,9 @@
|
||||
* inmem_smgr.c
|
||||
*
|
||||
* This is an implementation of the SMGR interface, used in the WAL redo
|
||||
* process. It has no persistent storage, the pages that are written out
|
||||
* are kept in a small number of in-memory buffers.
|
||||
* process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent
|
||||
* storage, the pages that are written out are kept in a small number of
|
||||
* in-memory buffers.
|
||||
*
|
||||
* Normally, replaying a WAL record only needs to access a handful of
|
||||
* buffers, which fit in the normal buffer cache, so this is just for
|
||||
@@ -14,11 +15,15 @@
|
||||
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* contrib/neon/inmem_smgr.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/xlog.h"
|
||||
#include "pagestore_client.h"
|
||||
#include "storage/block.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/relfilenode.h"
|
||||
@@ -28,8 +33,6 @@
|
||||
#include "access/xlogutils.h"
|
||||
#endif
|
||||
|
||||
#include "inmem_smgr.h"
|
||||
|
||||
/* Size of the in-memory smgr */
|
||||
#define MAX_PAGES 64
|
||||
|
||||
@@ -56,34 +59,10 @@ locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
/* neon wal-redo storage manager functionality */
|
||||
static void inmem_init(void);
|
||||
static void inmem_open(SMgrRelation reln);
|
||||
static void inmem_close(SMgrRelation reln, ForkNumber forknum);
|
||||
static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
|
||||
static bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
|
||||
static void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
|
||||
static void inmem_extend(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, char *buffer, bool skipFsync);
|
||||
static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum);
|
||||
static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
char *buffer);
|
||||
static void inmem_write(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, char *buffer, bool skipFsync);
|
||||
static void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, BlockNumber nblocks);
|
||||
static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
|
||||
static void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber nblocks);
|
||||
static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
|
||||
|
||||
|
||||
/*
|
||||
* inmem_init() -- Initialize private state
|
||||
*/
|
||||
static void
|
||||
void
|
||||
inmem_init(void)
|
||||
{
|
||||
used_pages = 0;
|
||||
@@ -92,7 +71,7 @@ inmem_init(void)
|
||||
/*
|
||||
* inmem_exists() -- Does the physical file exist?
|
||||
*/
|
||||
static bool
|
||||
bool
|
||||
inmem_exists(SMgrRelation reln, ForkNumber forknum)
|
||||
{
|
||||
for (int i = 0; i < used_pages; i++)
|
||||
@@ -111,7 +90,7 @@ inmem_exists(SMgrRelation reln, ForkNumber forknum)
|
||||
*
|
||||
* If isRedo is true, it's okay for the relation to exist already.
|
||||
*/
|
||||
static void
|
||||
void
|
||||
inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
|
||||
{
|
||||
}
|
||||
@@ -119,7 +98,7 @@ inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
|
||||
/*
|
||||
* inmem_unlink() -- Unlink a relation.
|
||||
*/
|
||||
static void
|
||||
void
|
||||
inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
|
||||
{
|
||||
}
|
||||
@@ -133,7 +112,7 @@ inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
|
||||
* EOF). Note that we assume writing a block beyond current EOF
|
||||
* causes intervening file space to become filled with zeroes.
|
||||
*/
|
||||
static void
|
||||
void
|
||||
inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
char *buffer, bool skipFsync)
|
||||
{
|
||||
@@ -144,7 +123,7 @@ inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
/*
|
||||
* inmem_open() -- Initialize newly-opened relation.
|
||||
*/
|
||||
static void
|
||||
void
|
||||
inmem_open(SMgrRelation reln)
|
||||
{
|
||||
}
|
||||
@@ -152,7 +131,7 @@ inmem_open(SMgrRelation reln)
|
||||
/*
|
||||
* inmem_close() -- Close the specified relation, if it isn't closed already.
|
||||
*/
|
||||
static void
|
||||
void
|
||||
inmem_close(SMgrRelation reln, ForkNumber forknum)
|
||||
{
|
||||
}
|
||||
@@ -160,7 +139,7 @@ inmem_close(SMgrRelation reln, ForkNumber forknum)
|
||||
/*
|
||||
* inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation
|
||||
*/
|
||||
static bool
|
||||
bool
|
||||
inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
{
|
||||
return true;
|
||||
@@ -169,7 +148,7 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
/*
|
||||
* inmem_writeback() -- Tell the kernel to write pages back to storage.
|
||||
*/
|
||||
static void
|
||||
void
|
||||
inmem_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, BlockNumber nblocks)
|
||||
{
|
||||
@@ -178,7 +157,7 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
/*
|
||||
* inmem_read() -- Read the specified block from a relation.
|
||||
*/
|
||||
static void
|
||||
void
|
||||
inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
char *buffer)
|
||||
{
|
||||
@@ -198,7 +177,7 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
* relation (ie, those before the current EOF). To extend a relation,
|
||||
* use mdextend().
|
||||
*/
|
||||
static void
|
||||
void
|
||||
inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
char *buffer, bool skipFsync)
|
||||
{
|
||||
@@ -245,7 +224,7 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
/*
|
||||
* inmem_nblocks() -- Get the number of blocks stored in a relation.
|
||||
*/
|
||||
static BlockNumber
|
||||
BlockNumber
|
||||
inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
{
|
||||
/*
|
||||
@@ -264,7 +243,7 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
/*
|
||||
* inmem_truncate() -- Truncate relation to specified number of blocks.
|
||||
*/
|
||||
static void
|
||||
void
|
||||
inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
{
|
||||
}
|
||||
@@ -272,7 +251,7 @@ inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
/*
|
||||
* inmem_immedsync() -- Immediately sync a relation to stable storage.
|
||||
*/
|
||||
static void
|
||||
void
|
||||
inmem_immedsync(SMgrRelation reln, ForkNumber forknum)
|
||||
{
|
||||
}
|
||||
@@ -40,21 +40,8 @@
|
||||
bool connected = false;
|
||||
PGconn *pageserver_conn = NULL;
|
||||
|
||||
/*
|
||||
* WaitEventSet containing:
|
||||
* - WL_SOCKET_READABLE on pageserver_conn,
|
||||
* - WL_LATCH_SET on MyLatch, and
|
||||
* - WL_EXIT_ON_PM_DEATH.
|
||||
*/
|
||||
WaitEventSet *pageserver_conn_wes = NULL;
|
||||
|
||||
char *page_server_connstring_raw;
|
||||
|
||||
int n_unflushed_requests = 0;
|
||||
int flush_every_n_requests = 8;
|
||||
|
||||
static void pageserver_flush(void);
|
||||
|
||||
static void
|
||||
pageserver_connect()
|
||||
{
|
||||
@@ -71,7 +58,6 @@ pageserver_connect()
|
||||
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg(NEON_TAG "could not establish connection to pageserver"),
|
||||
@@ -87,26 +73,22 @@ pageserver_connect()
|
||||
neon_log(ERROR, "could not send pagestream command to pageserver");
|
||||
}
|
||||
|
||||
pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
|
||||
|
||||
while (PQisBusy(pageserver_conn))
|
||||
{
|
||||
int wc;
|
||||
WaitEvent event;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
wc = WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
wc = WaitLatchOrSocket(MyLatch,
|
||||
WL_LATCH_SET | WL_SOCKET_READABLE |
|
||||
WL_EXIT_ON_PM_DEATH,
|
||||
PQsocket(pageserver_conn),
|
||||
-1L, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* Data available in socket? */
|
||||
if (event.events & WL_SOCKET_READABLE)
|
||||
if (wc & WL_SOCKET_READABLE)
|
||||
{
|
||||
if (!PQconsumeInput(pageserver_conn))
|
||||
{
|
||||
@@ -114,7 +96,6 @@ pageserver_connect()
|
||||
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
FreeWaitEventSet(pageserver_conn_wes);
|
||||
|
||||
neon_log(ERROR, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
@@ -131,30 +112,33 @@ pageserver_connect()
|
||||
* A wrapper around PQgetCopyData that checks for interrupts while sleeping.
|
||||
*/
|
||||
static int
|
||||
call_PQgetCopyData(char **buffer)
|
||||
call_PQgetCopyData(PGconn *conn, char **buffer)
|
||||
{
|
||||
int ret;
|
||||
|
||||
retry:
|
||||
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
|
||||
ret = PQgetCopyData(conn, buffer, 1 /* async */ );
|
||||
|
||||
if (ret == 0)
|
||||
{
|
||||
int wc;
|
||||
WaitEvent event;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
wc = WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
wc = WaitLatchOrSocket(MyLatch,
|
||||
WL_LATCH_SET | WL_SOCKET_READABLE |
|
||||
WL_EXIT_ON_PM_DEATH,
|
||||
PQsocket(conn),
|
||||
-1L, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* Data available in socket? */
|
||||
if (event.events & WL_SOCKET_READABLE)
|
||||
if (wc & WL_SOCKET_READABLE)
|
||||
{
|
||||
if (!PQconsumeInput(pageserver_conn))
|
||||
if (!PQconsumeInput(conn))
|
||||
neon_log(ERROR, "could not get response from pageserver: %s",
|
||||
PQerrorMessage(pageserver_conn));
|
||||
PQerrorMessage(conn));
|
||||
}
|
||||
|
||||
goto retry;
|
||||
@@ -180,11 +164,7 @@ pageserver_disconnect(void)
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
connected = false;
|
||||
|
||||
prefetch_on_ps_disconnect();
|
||||
}
|
||||
if (pageserver_conn_wes != NULL)
|
||||
FreeWaitEventSet(pageserver_conn_wes);
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -194,7 +174,11 @@ pageserver_send(NeonRequest * request)
|
||||
|
||||
/* If the connection was lost for some reason, reconnect */
|
||||
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
pageserver_disconnect();
|
||||
{
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
connected = false;
|
||||
}
|
||||
|
||||
if (!connected)
|
||||
pageserver_connect();
|
||||
@@ -218,11 +202,6 @@ pageserver_send(NeonRequest * request)
|
||||
}
|
||||
pfree(req_buff.data);
|
||||
|
||||
n_unflushed_requests++;
|
||||
|
||||
if (flush_every_n_requests > 0 && n_unflushed_requests >= flush_every_n_requests)
|
||||
pageserver_flush();
|
||||
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
{
|
||||
char *msg = nm_to_string((NeonMessage *) request);
|
||||
@@ -241,7 +220,7 @@ pageserver_receive(void)
|
||||
PG_TRY();
|
||||
{
|
||||
/* read response */
|
||||
resp_buff.len = call_PQgetCopyData(&resp_buff.data);
|
||||
resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data);
|
||||
resp_buff.cursor = 0;
|
||||
|
||||
if (resp_buff.len < 0)
|
||||
@@ -276,21 +255,25 @@ pageserver_receive(void)
|
||||
static void
|
||||
pageserver_flush(void)
|
||||
{
|
||||
if (!connected)
|
||||
{
|
||||
neon_log(WARNING, "Tried to flush while disconnected");
|
||||
}
|
||||
else if (PQflush(pageserver_conn))
|
||||
if (PQflush(pageserver_conn))
|
||||
{
|
||||
char *msg = PQerrorMessage(pageserver_conn);
|
||||
|
||||
pageserver_disconnect();
|
||||
neon_log(ERROR, "failed to flush page requests: %s", msg);
|
||||
}
|
||||
n_unflushed_requests = 0;
|
||||
}
|
||||
|
||||
static NeonResponse *
|
||||
pageserver_call(NeonRequest * request)
|
||||
{
|
||||
pageserver_send(request);
|
||||
pageserver_flush();
|
||||
return pageserver_receive();
|
||||
}
|
||||
|
||||
page_server_api api = {
|
||||
.request = pageserver_call,
|
||||
.send = pageserver_send,
|
||||
.flush = pageserver_flush,
|
||||
.receive = pageserver_receive
|
||||
@@ -436,6 +419,15 @@ pg_init_libpagestore(void)
|
||||
0, /* no flags required */
|
||||
check_neon_id, NULL, NULL);
|
||||
|
||||
DefineCustomBoolVariable("neon.wal_redo",
|
||||
"start in wal-redo mode",
|
||||
NULL,
|
||||
&wal_redo,
|
||||
false,
|
||||
PGC_POSTMASTER,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomIntVariable("neon.max_cluster_size",
|
||||
"cluster size limit",
|
||||
NULL,
|
||||
@@ -444,14 +436,6 @@ pg_init_libpagestore(void)
|
||||
PGC_SIGHUP,
|
||||
GUC_UNIT_MB,
|
||||
NULL, NULL, NULL);
|
||||
DefineCustomIntVariable("neon.flush_output_after",
|
||||
"Flush the output buffer after every N unflushed requests",
|
||||
NULL,
|
||||
&flush_every_n_requests,
|
||||
8, -1, INT_MAX,
|
||||
PGC_SIGHUP,
|
||||
0, /* no flags required */
|
||||
NULL, NULL, NULL);
|
||||
|
||||
relsize_hash_init();
|
||||
|
||||
@@ -468,7 +452,13 @@ pg_init_libpagestore(void)
|
||||
neon_timeline_walproposer = neon_timeline;
|
||||
neon_tenant_walproposer = neon_tenant;
|
||||
|
||||
if (page_server_connstring && page_server_connstring[0])
|
||||
if (wal_redo)
|
||||
{
|
||||
neon_log(PageStoreTrace, "set inmem_smgr hook");
|
||||
smgr_hook = smgr_inmem;
|
||||
smgr_init_hook = smgr_init_inmem;
|
||||
}
|
||||
else if (page_server_connstring && page_server_connstring[0])
|
||||
{
|
||||
neon_log(PageStoreTrace, "set neon_smgr hook");
|
||||
smgr_hook = smgr_neon;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user