Compare commits

..

1 Commits

Author SHA1 Message Date
Bojan Serafimov
47d29613a7 Use dashmap in page cache 2022-10-19 17:16:02 -04:00
198 changed files with 3430 additions and 11730 deletions

View File

@@ -55,22 +55,6 @@ runs:
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
path: /tmp/neon
- name: Download Neon binaries for the previous release
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
path: /tmp/neon-previous
prefix: latest
- name: Download compatibility snapshot for Postgres 14
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14
path: /tmp/compatibility_snapshot_pg14
prefix: latest
- name: Checkout
if: inputs.needs_postgres_source == 'true'
uses: actions/checkout@v3
@@ -92,15 +76,10 @@ runs:
- name: Run pytest
env:
NEON_BIN: /tmp/neon/bin
COMPATIBILITY_NEON_BIN: /tmp/neon-previous/bin
COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: ${{ inputs.build_type }}
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report
@@ -123,12 +102,7 @@ runs:
exit 1
fi
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
# -n4 uses four processes to run tests via pytest-xdist
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
# --dist=loadgroup points tests marked with @pytest.mark.xdist_group
# to the same worker to make @pytest.mark.order work with xdist
EXTRA_PARAMS="--dist=loadgroup $EXTRA_PARAMS"
fi
if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then
@@ -163,9 +137,9 @@ runs:
# --verbose prints name of each test (helpful when there are
# multiple tests in one file)
# -rA prints summary in the end
# -n4 uses four processes to run tests via pytest-xdist
# -s is not used to prevent pytest from capturing output, because tests are running
# in parallel and logs are mixed between different tests
#
mkdir -p $TEST_OUTPUT/allure/results
"${cov_prefix[@]}" ./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
@@ -180,17 +154,8 @@ runs:
scripts/generate_and_push_perf_report.sh
fi
- name: Upload compatibility snapshot for Postgres 14
if: github.ref_name == 'release'
uses: ./.github/actions/upload
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
# The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/
prefix: latest
- name: Create Allure report
if: success() || failure()
if: always()
uses: ./.github/actions/allure-report
with:
action: store

View File

@@ -3,6 +3,7 @@ storage:
bucket_name: neon-storage-ireland
bucket_region: eu-west-1
console_mgmt_base_url: http://neon-stress-console.local
env_name: neon-stress
etcd_endpoints: neon-stress-etcd.local:2379
safekeeper_enable_s3_offload: 'false'
pageserver_config_stub:
@@ -11,7 +12,6 @@ storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
safekeeper_s3_prefix: neon-stress/wal
hostname_suffix: ".local"
remote_user: admin
children:

View File

@@ -1,35 +0,0 @@
storage:
vars:
bucket_name: neon-prod-storage-ap-southeast-1
bucket_region: ap-southeast-1
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: ap-southeast-1
ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1
console_region_id: aws-ap-southeast-1
children:
pageservers:
hosts:
pageserver-0.ap-southeast-1.aws.neon.tech:
ansible_host: i-064de8ea28bdb495b
pageserver-1.ap-southeast-1.aws.neon.tech:
ansible_host: i-0b180defcaeeb6b93
safekeepers:
hosts:
safekeeper-0.ap-southeast-1.aws.neon.tech:
ansible_host: i-0d6f1dc5161eef894
safekeeper-1.ap-southeast-1.aws.neon.tech:
ansible_host: i-0e338adda8eb2d19f
safekeeper-2.ap-southeast-1.aws.neon.tech:
ansible_host: i-04fb63634e4679eb9

View File

@@ -1,35 +0,0 @@
storage:
vars:
bucket_name: neon-prod-storage-eu-central-1
bucket_region: eu-central-1
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: eu-central-1
ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1
console_region_id: aws-eu-central-1
children:
pageservers:
hosts:
pageserver-0.eu-central-1.aws.neon.tech:
ansible_host: i-0cd8d316ecbb715be
pageserver-1.eu-central-1.aws.neon.tech:
ansible_host: i-090044ed3d383fef0
safekeepers:
hosts:
safekeeper-0.eu-central-1.aws.neon.tech:
ansible_host: i-0b238612d2318a050
safekeeper-1.eu-central-1.aws.neon.tech:
ansible_host: i-07b9c45e5c2637cd4
safekeeper-2.eu-central-1.aws.neon.tech:
ansible_host: i-020257302c3c93d88

View File

@@ -1,36 +0,0 @@
storage:
vars:
bucket_name: neon-prod-storage-us-east-2
bucket_region: us-east-2
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: us-east-2
ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2
console_region_id: aws-us-east-2
children:
pageservers:
hosts:
pageserver-0.us-east-2.aws.neon.tech:
ansible_host: i-062227ba7f119eb8c
pageserver-1.us-east-2.aws.neon.tech:
ansible_host: i-0b3ec0afab5968938
safekeepers:
hosts:
safekeeper-0.us-east-2.aws.neon.tech:
ansible_host: i-0e94224750c57d346
safekeeper-1.us-east-2.aws.neon.tech:
ansible_host: i-06d113fb73bfddeb0
safekeeper-2.us-east-2.aws.neon.tech:
ansible_host: i-09f66c8e04afff2e8

View File

@@ -1,6 +1,7 @@
---
storage:
vars:
env_name: prod-1
console_mgmt_base_url: http://console-release.local
bucket_name: zenith-storage-oregon
bucket_region: us-west-2
@@ -11,7 +12,6 @@ storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
safekeeper_s3_prefix: prod-1/wal
hostname_suffix: ".local"
remote_user: admin
@@ -22,10 +22,6 @@ storage:
console_region_id: aws-us-west-2
zenith-1-ps-3:
console_region_id: aws-us-west-2
zenith-1-ps-4:
console_region_id: aws-us-west-2
zenith-1-ps-5:
console_region_id: aws-us-west-2
safekeepers:
hosts:

View File

@@ -1,2 +1,3 @@
ansible_connection: aws_ssm
ansible_aws_ssm_bucket_name: neon-dev-bucket
ansible_python_interpreter: /usr/bin/python3

View File

@@ -3,14 +3,14 @@ storage:
bucket_name: zenith-staging-storage-us-east-1
bucket_region: us-east-1
console_mgmt_base_url: http://console-staging.local
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
env_name: us-stage
etcd_endpoints: zenith-us-stage-etcd.local:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
safekeeper_s3_prefix: us-stage/wal
hostname_suffix: ".local"
remote_user: admin

View File

@@ -3,6 +3,7 @@ storage:
bucket_name: neon-staging-storage-us-east-2
bucket_region: us-east-2
console_mgmt_base_url: http://console-staging.local
env_name: us-stage
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
@@ -10,11 +11,9 @@ storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: us-east-2
ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
console_region_id: aws-us-east-2
children:

View File

@@ -6,7 +6,7 @@ After=network.target auditd.service
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT

View File

@@ -1,31 +0,0 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.us-east-2.aws.neon.build"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: dev
zenith_region: us-east-2
zenith_region_slug: us-east-2
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

View File

@@ -1,31 +0,0 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.ap-southeast-1.aws.neon.tech"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: ap-southeast-1
zenith_region_slug: ap-southeast-1
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

View File

@@ -1,31 +0,0 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.eu-central-1.aws.neon.tech"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: eu-central-1
zenith_region_slug: eu-central-1
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

View File

@@ -1,31 +0,0 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.us-east-2.aws.neon.tech"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: us-east-2
zenith_region_slug: us-east-2
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

View File

@@ -164,7 +164,7 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
PLATFORM: ${{ matrix.platform }}
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
options: --init
@@ -265,7 +265,7 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Create Allure report
if: success() || failure()
if: always()
uses: ./.github/actions/allure-report
with:
action: generate

View File

@@ -18,8 +18,8 @@ env:
jobs:
tag:
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
@@ -46,7 +46,7 @@ jobs:
id: build-tag
build-neon:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -127,8 +127,8 @@ jobs:
target/
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
key: |
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-
- name: Cache postgres v14 build
id: cache_pg_14
@@ -236,7 +236,7 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -268,8 +268,34 @@ jobs:
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
upload-latest-artifacts:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests ]
if: github.ref_name == 'main'
steps:
- name: Copy Neon artifact to the latest directory
shell: bash -euxo pipefail {0}
env:
BUCKET: neon-github-public-dev
PREFIX: artifacts/${{ github.run_id }}
run: |
for build_type in debug release; do
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME}
done
benchmarks:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -300,12 +326,12 @@ jobs:
# while coverage is currently collected for the debug ones
merge-allure-report:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests, benchmarks ]
if: success() || failure()
if: always()
strategy:
fail-fast: false
matrix:
@@ -338,7 +364,7 @@ jobs:
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
coverage-report:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -363,7 +389,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git/
target/
key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
- name: Get Neon artifact
uses: ./.github/actions/download
@@ -415,19 +441,15 @@ jobs:
shell: bash -euxo pipefail {0}
trigger-e2e-tests:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
needs: [ push-docker-hub, tag ]
needs: [ build-neon ]
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
# to place a job run status update later.
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
REMOTE_REPO="${{ github.repository_owner }}/cloud"
@@ -453,15 +475,12 @@ jobs:
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
\"remote_repo\": \"${{ github.repository }}\"
}
}"
neon-image:
runs-on: [ self-hosted, dev, x64 ]
needs: [ tag ]
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
steps:
@@ -475,11 +494,10 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build neon
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID
compute-tools-image:
runs-on: [ self-hosted, dev, x64 ]
needs: [ tag ]
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
steps:
@@ -490,12 +508,30 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute tools
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID
compute-node-image:
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
with:
submodules: true
fetch-depth: 0
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
# compute-node uses postgres 14, which is default now
# cloud repo depends on this image name, thus duplicating it
# remove compute-node when cloud repo is updated
- name: Kaniko build compute node with extensions v14 (compatibility)
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID
compute-node-image-v14:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
@@ -507,12 +543,12 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute node with extensions v14
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID
compute-node-image-v15:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
@@ -524,69 +560,28 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute node with extensions v15
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}
test-images:
needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
runs-on: [ self-hosted, dev, x64 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
# `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
# Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
# Regular pageserver version string looks like
# Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: []
# Bad versions might loop like:
# Neon page server git-env:local failpoints: true, features: ["testing"]
# Ensure that we don't have bad versions.
- name: Verify image versions
shell: bash # ensure no set -e for better error messages
run: |
pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
echo "Pageserver version string: $pageserver_version"
if ! echo "$pageserver_version" | grep -qv 'git-env:local' ; then
echo "Pageserver version should not be the default Dockerfile one"
exit 1
fi
if ! echo "$pageserver_version" | grep -qv '"testing"' ; then
echo "Pageserver version should have no testing feature enabled"
exit 1
fi
- name: Verify docker-compose example
run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
- name: Print logs and clean up
if: always()
run: |
docker compose -f ./docker-compose/docker-compose.yml logs || 0
docker compose -f ./docker-compose/docker-compose.yml down
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID
promote-images:
runs-on: [ self-hosted, dev, x64 ]
needs: [ tag, test-images ]
runs-on: dev
needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
if: github.event_name != 'workflow_dispatch'
container: amazon/aws-cli
strategy:
fail-fast: false
matrix:
name: [ neon, compute-node-v14, compute-node-v15, compute-tools ]
# compute-node uses postgres 14, which is default now
# cloud repo depends on this image name, thus duplicating it
# remove compute-node when cloud repo is updated
name: [ neon, compute-node, compute-node-v14, compute-node-v15, compute-tools ]
steps:
- name: Promote image to latest
run: |
export MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=${{needs.tag.outputs.build-tag}} --query 'images[].imageManifest' --output text)
aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"
run:
MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=$GITHUB_RUN_ID --query 'images[].imageManifest' --output text) && aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"
push-docker-hub:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
needs: [ promote-images, tag ]
container: golang:1.19-bullseye
@@ -602,16 +597,19 @@ jobs:
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Pull neon image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} neon
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:latest neon
- name: Pull compute tools image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest compute-tools
- name: Pull compute node image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node
- name: Pull compute node v14 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest compute-node-v14
- name: Pull compute node v15 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest compute-node-v15
- name: Pull rust image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
@@ -621,10 +619,11 @@ jobs:
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest
- name: Configure Docker Hub login
run: |
@@ -638,6 +637,9 @@ jobs:
- name: Push compute tools image to Docker Hub
run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
- name: Push compute node image to Docker Hub
run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}}
- name: Push compute node v14 image to Docker Hub
run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
@@ -654,6 +656,7 @@ jobs:
run: |
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
@@ -736,7 +739,7 @@ jobs:
rm -f neon_install.tar.gz .neon_current_version
deploy-new:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
@@ -747,9 +750,9 @@ jobs:
defaults:
run:
shell: bash
strategy:
matrix:
target_region: [ us-east-2 ]
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -772,51 +775,11 @@ jobs:
fi
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
target_region: [ us-east-2, eu-central-1, ap-southeast-1 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Redeploy
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
./get_binaries.sh
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
RELEASE=true ./get_binaries.sh
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}}
ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-proxy:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
@@ -856,109 +819,3 @@ jobs:
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
deploy-proxy-new:
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: dev-us-east-2-beta
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
deploy-proxy-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: prod-us-east-2-delta
- target_region: eu-central-1
target_cluster: prod-eu-central-1-gamma
- target_region: ap-southeast-1
target_cluster: prod-ap-southeast-1-epsilon
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
promote-compatibility-data:
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ deploy, deploy-proxy ]
if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
steps:
- name: Promote compatibility snapshot for the release
shell: bash -euxo pipefail {0}
env:
BUCKET: neon-github-public-dev
PREFIX: artifacts/latest
run: |
# Update compatibility snapshot for the release
for build_type in debug release; do
OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
done
# Update Neon artifact for the release (reuse already uploaded artifact)
for build_type in debug release; do
OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo 2>&1 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
done

View File

@@ -106,7 +106,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git
target
key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
- name: Run cargo clippy
run: ./run_clippy.sh
@@ -115,7 +115,7 @@ jobs:
run: cargo build --locked --all --all-targets
check-rust-dependencies:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init

2
.gitmodules vendored
View File

@@ -1,7 +1,7 @@
[submodule "vendor/postgres-v14"]
path = vendor/postgres-v14
url = https://github.com/neondatabase/postgres.git
branch = REL_14_STABLE_neon
branch = main
[submodule "vendor/postgres-v15"]
path = vendor/postgres-v15
url = https://github.com/neondatabase/postgres.git

View File

@@ -1,10 +0,0 @@
/compute_tools/ @neondatabase/control-plane
/control_plane/ @neondatabase/compute @neondatabase/storage
/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/pageserver/ @neondatabase/compute @neondatabase/storage
/pgxn/ @neondatabase/compute
/proxy/ @neondatabase/control-plane
/safekeeper/ @neondatabase/safekeepers

94
Cargo.lock generated
View File

@@ -317,6 +317,12 @@ dependencies = [
"generic-array",
]
[[package]]
name = "boxfnonce"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426"
[[package]]
name = "bstr"
version = "1.0.1"
@@ -594,7 +600,6 @@ dependencies = [
"tar",
"thiserror",
"toml",
"url",
"utils",
"workspace_hack",
]
@@ -844,6 +849,16 @@ dependencies = [
"syn",
]
[[package]]
name = "daemonize"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70c24513e34f53b640819f0ac9f705b673fcf4006d7aab8778bee72ebfc89815"
dependencies = [
"boxfnonce",
"libc",
]
[[package]]
name = "darling"
version = "0.14.1"
@@ -879,6 +894,19 @@ dependencies = [
"syn",
]
[[package]]
name = "dashmap"
version = "5.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
dependencies = [
"cfg-if",
"hashbrown",
"lock_api",
"once_cell",
"parking_lot_core 0.9.3",
]
[[package]]
name = "data-encoding"
version = "2.3.2"
@@ -2125,6 +2153,8 @@ dependencies = [
"crc32c",
"criterion",
"crossbeam-utils",
"daemonize",
"dashmap",
"etcd_broker",
"fail",
"futures",
@@ -2145,7 +2175,6 @@ dependencies = [
"postgres-types",
"postgres_ffi",
"pprof",
"pq_proto",
"rand",
"regex",
"remote_storage",
@@ -2155,10 +2184,8 @@ dependencies = [
"serde_json",
"serde_with",
"signal-hook",
"svg_fmt",
"tar",
"tempfile",
"tenant_size_model",
"thiserror",
"tokio",
"tokio-postgres",
@@ -2175,11 +2202,7 @@ dependencies = [
name = "pageserver_api"
version = "0.1.0"
dependencies = [
"anyhow",
"byteorder",
"bytes",
"const_format",
"postgres_ffi",
"serde",
"serde_with",
"utils",
@@ -2439,21 +2462,6 @@ version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"
[[package]]
name = "pq_proto"
version = "0.1.0"
dependencies = [
"anyhow",
"bytes",
"pin-project-lite",
"postgres-protocol",
"rand",
"serde",
"tokio",
"tracing",
"workspace_hack",
]
[[package]]
name = "prettyplease"
version = "0.1.21"
@@ -2586,7 +2594,6 @@ dependencies = [
"once_cell",
"parking_lot 0.12.1",
"pin-project-lite",
"pq_proto",
"rand",
"rcgen",
"reqwest",
@@ -3090,6 +3097,7 @@ dependencies = [
"clap 4.0.15",
"const_format",
"crc32c",
"daemonize",
"etcd_broker",
"fs2",
"git-version",
@@ -3097,13 +3105,11 @@ dependencies = [
"humantime",
"hyper",
"metrics",
"nix 0.25.0",
"once_cell",
"parking_lot 0.12.1",
"postgres",
"postgres-protocol",
"postgres_ffi",
"pq_proto",
"regex",
"remote_storage",
"safekeeper_api",
@@ -3469,12 +3475,6 @@ version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601"
[[package]]
name = "svg_fmt"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2"
[[package]]
name = "symbolic-common"
version = "8.8.0"
@@ -3552,13 +3552,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "tenant_size_model"
version = "0.1.0"
dependencies = [
"workspace_hack",
]
[[package]]
name = "termcolor"
version = "1.1.3"
@@ -3953,16 +3946,6 @@ dependencies = [
"tracing-core",
]
[[package]]
name = "tracing-serde"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1"
dependencies = [
"serde",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.3.16"
@@ -3973,15 +3956,12 @@ dependencies = [
"nu-ansi-term",
"once_cell",
"regex",
"serde",
"serde_json",
"sharded-slab",
"smallvec",
"thread_local",
"tracing",
"tracing-core",
"tracing-log",
"tracing-serde",
]
[[package]]
@@ -4064,7 +4044,9 @@ dependencies = [
"metrics",
"nix 0.25.0",
"once_cell",
"pq_proto",
"pin-project-lite",
"postgres",
"postgres-protocol",
"rand",
"routerify",
"rustls",
@@ -4074,8 +4056,6 @@ dependencies = [
"serde_json",
"serde_with",
"signal-hook",
"strum",
"strum_macros",
"tempfile",
"thiserror",
"tokio",
@@ -4389,9 +4369,6 @@ dependencies = [
"crossbeam-utils",
"either",
"fail",
"futures-channel",
"futures-task",
"futures-util",
"hashbrown",
"indexmap",
"libc",
@@ -4405,7 +4382,6 @@ dependencies = [
"rand",
"regex",
"regex-syntax",
"reqwest",
"scopeguard",
"serde",
"stable_deref_trait",

View File

@@ -25,10 +25,6 @@ members = [
# Besides, debug info should not affect the performance.
debug = true
# disable debug symbols for all packages except this one to decrease binaries size
[profile.release.package."*"]
debug = false
[profile.release-line-debug]
inherits = "release"
debug = 1 # true = 2 = all symbols, 1 = line only

View File

@@ -44,7 +44,7 @@ COPY . .
# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \
&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin proxy --locked --release \
&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin safekeeper --bin proxy --locked --release \
&& cachepot -s
# Build final image
@@ -65,7 +65,6 @@ RUN set -e \
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin

View File

@@ -1,50 +1,50 @@
#
# This file is identical to the Dockerfile.compute-node-v15 file
# except for the version of Postgres that is built.
#
ARG TAG=pinned
# apparently, ARGs don't get replaced in RUN commands in kaniko
# ARG POSTGIS_VERSION=3.3.0
# ARG PLV8_VERSION=3.1.4
# ARG PG_VERSION=v14
#########################################################################################
#
# Layer "build-deps"
#
#########################################################################################
FROM debian:bullseye-slim AS build-deps
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev
#########################################################################################
#
# Layer "pg-build"
# Build Postgres from the neon postgres repository.
#
#########################################################################################
FROM build-deps AS pg-build
COPY vendor/postgres-v14 postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
#########################################################################################
#
# Layer "postgis-build"
# Build PostGIS from the upstream PostGIS mirror.
#
#########################################################################################
# PostGIS compiles against neon postgres sources without changes. Perhaps we
# could even use the upstream binaries, compiled against vanilla Postgres, but
# it would require some investigation to check that it works, and also keeps
# working in the future. So for now, we compile our own binaries.
FROM build-deps AS postgis-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
tar xvzf postgis-3.3.1.tar.gz && \
cd postgis-3.3.1 && \
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
tar xvzf postgis-3.3.0.tar.gz && \
cd postgis-3.3.0 && \
./autogen.sh && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
./configure && \
@@ -57,29 +57,19 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
#########################################################################################
#
# Layer "plv8-build"
# Build plv8
#
#########################################################################################
FROM build-deps AS plv8-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5
# https://github.com/plv8/plv8/issues/475:
# v8 uses gold for linking and sets `--thread-count=4` which breaks
# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
# Install newer gold version manually as debian-testing binutils version updates
# libc version, which in turn breaks other extension built against non-testing libc.
RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
tar xvzf binutils-2.38.tar.gz && \
cd binutils-2.38 && \
cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
cd ../bfd && ./configure && make bfdver.h && \
cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
cp /usr/local/bin/ld.gold /usr/bin/gold
# https://github.com/plv8/plv8/issues/475
# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
RUN apt update && \
apt install -y --no-install-recommends -t testing binutils
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
@@ -87,25 +77,21 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
cd plv8-3.1.4 && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
rm -rf /plv8-* && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
#########################################################################################
#
# Layer "h3-pg-build"
# Build h3_pg
#
#########################################################################################
FROM build-deps AS h3-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# packaged cmake is too old
RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
-q -O /tmp/cmake-install.sh \
&& chmod u+x /tmp/cmake-install.sh \
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
&& rm /tmp/cmake-install.sh
RUN apt update && \
apt install -y --no-install-recommends -t testing cmake
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
tar xvzf h3.tgz && \
@@ -124,15 +110,12 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
# compile neon extensions
#
#########################################################################################
FROM build-deps AS neon-pg-ext-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -145,22 +128,16 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
-C pgxn/neon \
-s install
#########################################################################################
#
# Compile and run the Neon-specific `compute_ctl` binary
#
#########################################################################################
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Clean up postgres folder before inclusion
#
#########################################################################################
FROM neon-pg-ext-build AS postgres-cleanup-layer
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
@@ -178,12 +155,10 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Final layer
# Put it all together into the final image
#
#########################################################################################
FROM debian:bullseye-slim
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
@@ -200,6 +175,8 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libreadline8 for psql
# libossp-uuid16 for extension ossp-uuid
# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
# GLIBC 2.34 for plv8.
# Debian bullseye provides GLIBC 2.31, so we install the library from testing
#
# Lastly, link compute_ctl into zenith_ctl while we're at it,
# so that we don't need to put this in another layer.
@@ -212,6 +189,12 @@ RUN apt update && \
libproj19 \
libprotobuf-c1 && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
echo "Installing GLIBC 2.34" && \
echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update && \
apt install -y --no-install-recommends -t testing libc6 && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
USER postgres

View File

@@ -4,39 +4,44 @@
#
ARG TAG=pinned
# apparently, ARGs don't get replaced in RUN commands in kaniko
# ARG POSTGIS_VERSION=3.3.1
# ARG PLV8_VERSION=3.1.4
# ARG PG_VERSION=v15
#########################################################################################
#
# Layer "build-deps"
#
#########################################################################################
FROM debian:bullseye-slim AS build-deps
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev
#########################################################################################
#
# Layer "pg-build"
# Build Postgres from the neon postgres repository.
#
#########################################################################################
FROM build-deps AS pg-build
COPY vendor/postgres-v15 postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
#########################################################################################
#
# Layer "postgis-build"
# Build PostGIS from the upstream PostGIS mirror.
#
#########################################################################################
# PostGIS compiles against neon postgres sources without changes. Perhaps we
# could even use the upstream binaries, compiled against vanilla Postgres, but
# it would require some investigation to check that it works, and also keeps
# working in the future. So for now, we compile our own binaries.
FROM build-deps AS postgis-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
@@ -57,29 +62,19 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
#########################################################################################
#
# Layer "plv8-build"
# Build plv8
#
#########################################################################################
FROM build-deps AS plv8-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5
# https://github.com/plv8/plv8/issues/475:
# v8 uses gold for linking and sets `--thread-count=4` which breaks
# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
# Install newer gold version manually as debian-testing binutils version updates
# libc version, which in turn breaks other extension built against non-testing libc.
RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
tar xvzf binutils-2.38.tar.gz && \
cd binutils-2.38 && \
cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
cd ../bfd && ./configure && make bfdver.h && \
cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
cp /usr/local/bin/ld.gold /usr/bin/gold
# https://github.com/plv8/plv8/issues/475
# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
RUN apt update && \
apt install -y --no-install-recommends -t testing binutils
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
@@ -87,25 +82,21 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
cd plv8-3.1.4 && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
rm -rf /plv8-* && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
#########################################################################################
#
# Layer "h3-pg-build"
# Build h3_pg
#
#########################################################################################
FROM build-deps AS h3-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# packaged cmake is too old
RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
-q -O /tmp/cmake-install.sh \
&& chmod u+x /tmp/cmake-install.sh \
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
&& rm /tmp/cmake-install.sh
RUN apt update && \
apt install -y --no-install-recommends -t testing cmake
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
tar xvzf h3.tgz && \
@@ -124,15 +115,12 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
# compile neon extensions
#
#########################################################################################
FROM build-deps AS neon-pg-ext-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -145,22 +133,16 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
-C pgxn/neon \
-s install
#########################################################################################
#
# Compile and run the Neon-specific `compute_ctl` binary
#
#########################################################################################
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Clean up postgres folder before inclusion
#
#########################################################################################
FROM neon-pg-ext-build AS postgres-cleanup-layer
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
@@ -178,12 +160,10 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Final layer
# Put it all together into the final image
#
#########################################################################################
FROM debian:bullseye-slim
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
@@ -200,6 +180,8 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libreadline8 for psql
# libossp-uuid16 for extension ossp-uuid
# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
# GLIBC 2.34 for plv8.
# Debian bullseye provides GLIBC 2.31, so we install the library from testing
#
# Lastly, link compute_ctl into zenith_ctl while we're at it,
# so that we don't need to put this in another layer.
@@ -212,6 +194,12 @@ RUN apt update && \
libproj19 \
libprotobuf-c1 && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
echo "Installing GLIBC 2.34" && \
echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update && \
apt install -y --no-install-recommends -t testing libc6 && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
USER postgres

View File

@@ -0,0 +1,88 @@
#
# Legacy version of the Dockerfile for the compute node.
# Used by e2e CI. Building Dockerfile.compute-node will take
# unreasonable ammount of time without v2 runners.
#
# TODO: remove once cloud repo CI is moved to v2 runners.
#
# Allow specifiyng different compute-tools tag and image repo, so we are
# able to use different images
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG IMAGE=compute-tools
ARG TAG=latest
#
# Image with pre-built tools
#
FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps
# Only to get ready compute_ctl binary as deppendency
#
# Image with Postgres build deps
#
FROM debian:bullseye-slim AS build-deps
RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev
#
# Image with built Postgres
#
FROM build-deps AS pg-build
# Add user postgres
RUN adduser postgres
RUN mkdir /pg && chown postgres:postgres /pg
# Copy source files
# version 14 is default for now
COPY ./vendor/postgres-v14 /pg/
COPY ./pgxn /pg/
# Build and install Postgres locally
RUN mkdir /pg/compute_build && cd /pg/compute_build && \
../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \
# Install main binaries and contribs
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install
# Install neon contrib
RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install
USER postgres
WORKDIR /pg
#
# Final compute node image to be exported
#
FROM debian:bullseye-slim
# libreadline-dev is required to run psql
RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
chown -R postgres:postgres /var/db/postgres && \
chmod 0750 /var/db/postgres/compute
# Copy ready Postgres binaries
COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local
# Copy binaries from compute-tools
COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl
# XXX: temporary symlink for compatibility with old control-plane
RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
# Add postgres shared objects to the search path
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

View File

@@ -151,11 +151,6 @@ neon-pg-ext-v14: postgres-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+@echo "Compiling neon_walredo v14"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
+@echo "Compiling neon_test_utils" v14
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
@@ -168,11 +163,6 @@ neon-pg-ext-v15: postgres-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+@echo "Compiling neon_walredo v15"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
+@echo "Compiling neon_test_utils" v15
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \

View File

@@ -223,7 +223,10 @@ Ensure your dependencies are installed as described [here](https://github.com/ne
```sh
git clone --recursive https://github.com/neondatabase/neon.git
# either:
CARGO_BUILD_FLAGS="--features=testing" make
# or:
make debug
./scripts/pytest
```

View File

@@ -65,7 +65,7 @@ impl GenericOption {
let name = match self.name.as_str() {
"safekeepers" => "neon.safekeepers",
"wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
"wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout",
"wal_acceptor_connect_timeout" => "neon.safekeeper_connect_timeout",
it => it,
};

View File

@@ -424,29 +424,8 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
db_client.simple_query(&alter_query)?;
// Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
// This is needed because since postgres 15 this privilege is removed by default.
let grant_query = "DO $$\n\
BEGIN\n\
IF EXISTS(\n\
SELECT nspname\n\
FROM pg_catalog.pg_namespace\n\
WHERE nspname = 'public'\n\
) AND\n\
current_setting('server_version_num')::int/10000 >= 15\n\
THEN\n\
IF EXISTS(\n\
SELECT rolname\n\
FROM pg_catalog.pg_roles\n\
WHERE rolname = 'web_access'\n\
)\n\
THEN\n\
GRANT CREATE ON SCHEMA public TO web_access;\n\
END IF;\n\
END IF;\n\
END\n\
$$;"
.to_string();
// This is needed since postgres 15, where this privilege is removed by default.
let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string();
info!("grant query for db {} : {}", &db.name, &grant_query);
db_client.simple_query(&grant_query)?;
}

View File

@@ -4,21 +4,20 @@ version = "0.1.0"
edition = "2021"
[dependencies]
anyhow = "1.0"
clap = "4.0"
comfy-table = "6.1"
git-version = "0.3.5"
nix = "0.25"
once_cell = "1.13.0"
tar = "0.4.38"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
regex = "1"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
serde = { version = "1.0", features = ["derive"] }
serde_with = "2.0"
tar = "0.4.38"
thiserror = "1"
toml = "0.5"
url = "2.2.2"
once_cell = "1.13.0"
regex = "1"
anyhow = "1.0"
thiserror = "1"
nix = "0.25"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
# instead, so that recompile times are better.

View File

@@ -1,264 +0,0 @@
//! Spawns and kills background processes that are needed by Neon CLI.
//! Applies common set-up such as log and pid files (if needed) to every process.
//!
//! Neon CLI does not run in background, so it needs to store the information about
//! spawned processes, which it does in this module.
//! We do that by storing the pid of the process in the "${process_name}.pid" file.
//! The pid file can be created by the process itself
//! (Neon storage binaries do that and also ensure that a lock is taken onto that file)
//! or we create such file after starting the process
//! (non-Neon binaries don't necessarily follow our pidfile conventions).
//! The pid stored in the file is later used to stop the service.
//!
//! See [`lock_file`] module for more info.
use std::ffi::OsStr;
use std::io::Write;
use std::path::Path;
use std::process::{Child, Command};
use std::time::Duration;
use std::{fs, io, thread};
use anyhow::{anyhow, bail, Context, Result};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use utils::lock_file;
const RETRIES: u32 = 15;
const RETRY_TIMEOUT_MILLIS: u64 = 500;
/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
/// it itself.
pub enum InitialPidFile<'t> {
/// Create a pidfile, to allow future CLI invocations to manipulate the process.
Create(&'t Path),
/// The process will create the pidfile itself, need to wait for that event.
Expect(&'t Path),
}
/// Start a background child process using the parameters given.
pub fn start_process<F, S: AsRef<OsStr>>(
process_name: &str,
datadir: &Path,
command: &Path,
args: &[S],
initial_pid_file: InitialPidFile,
process_status_check: F,
) -> anyhow::Result<Child>
where
F: Fn() -> anyhow::Result<bool>,
{
let log_path = datadir.join(format!("{process_name}.log"));
let process_log_file = fs::OpenOptions::new()
.create(true)
.write(true)
.append(true)
.open(&log_path)
.with_context(|| {
format!("Could not open {process_name} log file {log_path:?} for writing")
})?;
let same_file_for_stderr = process_log_file.try_clone().with_context(|| {
format!("Could not reuse {process_name} log file {log_path:?} for writing stderr")
})?;
let mut command = Command::new(command);
let background_command = command
.stdout(process_log_file)
.stderr(same_file_for_stderr)
.args(args);
let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
let mut spawned_process = filled_cmd.spawn().with_context(|| {
format!("Could not spawn {process_name}, see console output and log files for details.")
})?;
let pid = spawned_process.id();
let pid = Pid::from_raw(
i32::try_from(pid)
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
);
let pid_file_to_check = match initial_pid_file {
InitialPidFile::Create(target_pid_file_path) => {
match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
lock_file::LockCreationResult::Created { .. } => {
// We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
// as this CLI invocation exits, so it's a bit useless, but doesn't any harm either.
}
lock_file::LockCreationResult::AlreadyLocked { .. } => {
anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
}
lock_file::LockCreationResult::CreationFailed(e) => {
return Err(e.context(format!(
"Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
)))
}
}
None
}
InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
};
for retries in 0..RETRIES {
match process_started(pid, pid_file_to_check, &process_status_check) {
Ok(true) => {
println!("\n{process_name} started, pid: {pid}");
return Ok(spawned_process);
}
Ok(false) => {
if retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!("{process_name} has not started yet, retrying ({retries})...");
}
thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS));
}
Err(e) => {
println!("{process_name} failed to start: {e:#}");
if let Err(e) = spawned_process.kill() {
println!("Could not stop {process_name} subprocess: {e:#}")
};
return Err(e);
}
}
}
anyhow::bail!("{process_name} could not start in {RETRIES} attempts");
}
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
if !pid_file.exists() {
println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
return Ok(());
}
let pid = read_pidfile(pid_file)?;
let sig = if immediate {
print!("Stopping {process_name} with pid {pid} immediately..");
Signal::SIGQUIT
} else {
print!("Stopping {process_name} with pid {pid} gracefully..");
Signal::SIGTERM
};
io::stdout().flush().unwrap();
match kill(pid, sig) {
Ok(()) => (),
Err(Errno::ESRCH) => {
println!(
"{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
);
return Ok(());
}
Err(e) => anyhow::bail!("Failed to send signal to {process_name} with pid {pid}: {e}"),
}
// Wait until process is gone
for _ in 0..RETRIES {
match process_has_stopped(pid) {
Ok(true) => {
println!("\n{process_name} stopped");
if let Err(e) = fs::remove_file(pid_file) {
if e.kind() != io::ErrorKind::NotFound {
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
}
}
return Ok(());
}
Ok(false) => {
print!(".");
io::stdout().flush().unwrap();
thread::sleep(Duration::from_secs(1))
}
Err(e) => {
println!("{process_name} with pid {pid} failed to stop: {e:#}");
return Err(e);
}
}
}
anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts");
}
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
let var = "LLVM_PROFILE_FILE";
if let Some(val) = std::env::var_os(var) {
filled_cmd = filled_cmd.env(var, val);
}
const RUST_LOG_KEY: &str = "RUST_LOG";
if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
filled_cmd.env(RUST_LOG_KEY, rust_log_value)
} else {
filled_cmd
}
}
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_SESSION_TOKEN",
] {
if let Ok(value) = std::env::var(env_key) {
cmd = cmd.env(env_key, value);
}
}
cmd
}
fn process_started<F>(
pid: Pid,
pid_file_to_check: Option<&Path>,
status_check: &F,
) -> anyhow::Result<bool>
where
F: Fn() -> anyhow::Result<bool>,
{
match status_check() {
Ok(true) => match pid_file_to_check {
Some(pid_file_path) => {
if pid_file_path.exists() {
let pid_in_file = read_pidfile(pid_file_path)?;
Ok(pid_in_file == pid)
} else {
Ok(false)
}
}
None => Ok(true),
},
Ok(false) => Ok(false),
Err(e) => anyhow::bail!("process failed to start: {e}"),
}
}
/// Read a PID file
///
/// We expect a file that contains a single integer.
fn read_pidfile(pidfile: &Path) -> Result<Pid> {
let pid_str = fs::read_to_string(pidfile)
.with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
let pid: i32 = pid_str
.parse()
.map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
if pid < 1 {
bail!("pidfile {pidfile:?} contained bad value '{pid}'");
}
Ok(Pid::from_raw(pid))
}
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
match kill(pid, None) {
// Process exists, keep waiting
Ok(_) => Ok(false),
// Process not found, we're done
Err(Errno::ESRCH) => Ok(true),
Err(err) => anyhow::bail!("Failed to send signal to process with pid {pid}: {err}"),
}
}

View File

@@ -9,8 +9,8 @@ use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
use control_plane::compute::ComputeControlPlane;
use control_plane::local_env::{EtcdBroker, LocalEnv};
use control_plane::pageserver::PageServerNode;
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage::PageServerNode;
use control_plane::{etcd, local_env};
use pageserver_api::models::TimelineInfo;
use pageserver_api::{

View File

@@ -12,14 +12,15 @@ use std::time::Duration;
use anyhow::{Context, Result};
use utils::{
connstring::connection_host_port,
id::{TenantId, TimelineId},
lsn::Lsn,
postgres_backend::AuthType,
};
use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
use crate::pageserver::PageServerNode;
use crate::postgresql_conf::PostgresConf;
use crate::storage::PageServerNode;
//
// ComputeControlPlane
@@ -182,18 +183,18 @@ impl PostgresNode {
}
fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
let pg_path = self.env.pg_bin_dir(pg_version).join("postgres");
let mut cmd = Command::new(&pg_path);
cmd.arg("--sync-safekeepers")
.env_clear()
.env(
"LD_LIBRARY_PATH",
self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
self.env.pg_lib_dir(pg_version).to_str().unwrap(),
)
.env(
"DYLD_LIBRARY_PATH",
self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
self.env.pg_lib_dir(pg_version).to_str().unwrap(),
)
.env("PGDATA", self.pgdata().to_str().unwrap())
.stdout(Stdio::piped())
@@ -281,7 +282,9 @@ impl PostgresNode {
fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
let mut conf = PostgresConf::new();
conf.append("max_wal_senders", "10");
conf.append("wal_log_hints", "off");
// wal_log_hints is mandatory when running against pageserver (see gh issue#192)
// TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
conf.append("wal_log_hints", "on");
conf.append("max_replication_slots", "10");
conf.append("hot_standby", "on");
conf.append("shared_buffers", "1MB");
@@ -299,8 +302,7 @@ impl PostgresNode {
// Configure the node to fetch pages from pageserver
let pageserver_connstr = {
let config = &self.pageserver.pg_connection_config;
let (host, port) = (config.host(), config.port());
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
// Set up authentication
//
@@ -420,7 +422,7 @@ impl PostgresNode {
}
fn pg_ctl(&self, args: &[&str], auth_token: &Option<String>) -> Result<()> {
let pg_ctl_path = self.env.pg_bin_dir(self.pg_version)?.join("pg_ctl");
let pg_ctl_path = self.env.pg_bin_dir(self.pg_version).join("pg_ctl");
let mut cmd = Command::new(pg_ctl_path);
cmd.args(
[
@@ -438,11 +440,11 @@ impl PostgresNode {
.env_clear()
.env(
"LD_LIBRARY_PATH",
self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
self.env.pg_lib_dir(self.pg_version).to_str().unwrap(),
)
.env(
"DYLD_LIBRARY_PATH",
self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
self.env.pg_lib_dir(self.pg_version).to_str().unwrap(),
);
if let Some(token) = auth_token {
cmd.env("ZENITH_AUTH_TOKEN", token);

View File

@@ -1,57 +0,0 @@
use url::Url;
#[derive(Debug)]
pub struct PgConnectionConfig {
url: Url,
}
impl PgConnectionConfig {
pub fn host(&self) -> &str {
self.url.host_str().expect("BUG: no host")
}
pub fn port(&self) -> u16 {
self.url.port().expect("BUG: no port")
}
/// Return a `<host>:<port>` string.
pub fn raw_address(&self) -> String {
format!("{}:{}", self.host(), self.port())
}
/// Connect using postgres protocol with TLS disabled.
pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
postgres::Client::connect(self.url.as_str(), postgres::NoTls)
}
}
impl std::str::FromStr for PgConnectionConfig {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut url: Url = s.parse()?;
match url.scheme() {
"postgres" | "postgresql" => {}
other => anyhow::bail!("invalid scheme: {other}"),
}
// It's not a valid connection url if host is unavailable.
if url.host().is_none() {
anyhow::bail!(url::ParseError::EmptyHost);
}
// E.g. `postgres:bar`.
if url.cannot_be_a_base() {
anyhow::bail!("URL cannot be a base");
}
// Set the default PG port if it's missing.
if url.port().is_none() {
url.set_port(Some(5432))
.expect("BUG: couldn't set the default port");
}
Ok(Self { url })
}
}

View File

@@ -1,75 +1,95 @@
use std::{fs, path::PathBuf};
use std::{
fs,
path::PathBuf,
process::{Command, Stdio},
};
use anyhow::Context;
use nix::{
sys::signal::{kill, Signal},
unistd::Pid,
};
use crate::{background_process, local_env};
use crate::{local_env, read_pidfile};
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_broker = &env.etcd_broker;
println!(
"Starting etcd broker using {:?}",
etcd_broker.etcd_binary_path
"Starting etcd broker using {}",
etcd_broker.etcd_binary_path.display()
);
let etcd_data_dir = env.base_data_dir.join("etcd");
fs::create_dir_all(&etcd_data_dir)
.with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?;
fs::create_dir_all(&etcd_data_dir).with_context(|| {
format!(
"Failed to create etcd data dir: {}",
etcd_data_dir.display()
)
})?;
let etcd_stdout_file =
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
format!(
"Failed to create etcd stout file in directory {}",
etcd_data_dir.display()
)
})?;
let etcd_stderr_file =
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
format!(
"Failed to create etcd stderr file in directory {}",
etcd_data_dir.display()
)
})?;
let client_urls = etcd_broker.comma_separated_endpoints();
let args = [
format!("--data-dir={}", etcd_data_dir.display()),
format!("--listen-client-urls={client_urls}"),
format!("--advertise-client-urls={client_urls}"),
// Set --quota-backend-bytes to keep the etcd virtual memory
// size smaller. Our test etcd clusters are very small.
// See https://github.com/etcd-io/etcd/issues/7910
"--quota-backend-bytes=100000000".to_string(),
// etcd doesn't compact (vacuum) with default settings,
// enable it to prevent space exhaustion.
"--auto-compaction-mode=revision".to_string(),
"--auto-compaction-retention=1".to_string(),
];
let pid_file_path = etcd_pid_file_path(env);
let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
.args(&[
format!("--data-dir={}", etcd_data_dir.display()),
format!("--listen-client-urls={client_urls}"),
format!("--advertise-client-urls={client_urls}"),
// Set --quota-backend-bytes to keep the etcd virtual memory
// size smaller. Our test etcd clusters are very small.
// See https://github.com/etcd-io/etcd/issues/7910
"--quota-backend-bytes=100000000".to_string(),
])
.stdout(Stdio::from(etcd_stdout_file))
.stderr(Stdio::from(etcd_stderr_file))
.spawn()
.context("Failed to spawn etcd subprocess")?;
let pid = etcd_process.id();
let client = reqwest::blocking::Client::new();
background_process::start_process(
"etcd",
&etcd_data_dir,
&etcd_broker.etcd_binary_path,
&args,
background_process::InitialPidFile::Create(&pid_file_path),
|| {
for broker_endpoint in &etcd_broker.broker_endpoints {
let request = broker_endpoint
.join("health")
.with_context(|| {
format!(
"Failed to append /health path to broker endopint {}",
broker_endpoint
)
})
.and_then(|url| {
client.get(&url.to_string()).build().with_context(|| {
format!("Failed to construct request to etcd endpoint {url}")
})
})?;
if client.execute(request).is_ok() {
return Ok(true);
}
}
Ok(false)
},
)
.context("Failed to spawn etcd subprocess")?;
let etcd_pid_file_path = etcd_pid_file_path(env);
fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
format!(
"Failed to create etcd pid file at {}",
etcd_pid_file_path.display()
)
})?;
Ok(())
}
pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
background_process::stop_process(true, "etcd", &etcd_pid_file_path(env))
let etcd_path = &env.etcd_broker.etcd_binary_path;
println!("Stopping etcd broker at {}", etcd_path.display());
let etcd_pid_file_path = etcd_pid_file_path(env);
let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
format!(
"Failed to read etcd pid file at {}",
etcd_pid_file_path.display()
)
})?);
kill(pid, Signal::SIGTERM).with_context(|| {
format!(
"Failed to stop etcd with pid {pid} at {}",
etcd_pid_file_path.display()
)
})?;
Ok(())
}
fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {

View File

@@ -6,12 +6,59 @@
// Intended to be used in integration tests and in CLI tools for
// local installations.
//
use anyhow::{anyhow, bail, Context, Result};
use std::fs;
use std::path::Path;
use std::process::Command;
mod background_process;
pub mod compute;
pub mod connection;
pub mod etcd;
pub mod local_env;
pub mod pageserver;
pub mod postgresql_conf;
pub mod safekeeper;
pub mod storage;
/// Read a PID file
///
/// We expect a file that contains a single integer.
/// We return an i32 for compatibility with libc and nix.
pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
let pid_str = fs::read_to_string(pidfile)
.with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
let pid: i32 = pid_str
.parse()
.map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
if pid < 1 {
bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
}
Ok(pid)
}
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
let cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
let var = "LLVM_PROFILE_FILE";
if let Some(val) = std::env::var_os(var) {
cmd.env(var, val);
}
const RUST_LOG_KEY: &str = "RUST_LOG";
if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
cmd.env(RUST_LOG_KEY, rust_log_value)
} else {
cmd
}
}
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_SESSION_TOKEN",
] {
if let Ok(value) = std::env::var(env_key) {
cmd = cmd.env(env_key, value);
}
}
cmd
}

View File

@@ -201,37 +201,37 @@ impl LocalEnv {
self.pg_distrib_dir.clone()
}
pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf {
let path = self.pg_distrib_dir.clone();
match pg_version {
14 => Ok(path.join(format!("v{pg_version}"))),
15 => Ok(path.join(format!("v{pg_version}"))),
_ => bail!("Unsupported postgres version: {}", pg_version),
14 => path.join(format!("v{pg_version}")),
15 => path.join(format!("v{pg_version}")),
_ => panic!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf {
match pg_version {
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
_ => bail!("Unsupported postgres version: {}", pg_version),
14 => self.pg_distrib_dir(pg_version).join("bin"),
15 => self.pg_distrib_dir(pg_version).join("bin"),
_ => panic!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf {
match pg_version {
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
_ => bail!("Unsupported postgres version: {}", pg_version),
14 => self.pg_distrib_dir(pg_version).join("lib"),
15 => self.pg_distrib_dir(pg_version).join("lib"),
_ => panic!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pageserver_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("pageserver")
pub fn pageserver_bin(&self) -> anyhow::Result<PathBuf> {
Ok(self.neon_distrib_dir.join("pageserver"))
}
pub fn safekeeper_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("safekeeper")
pub fn safekeeper_bin(&self) -> anyhow::Result<PathBuf> {
Ok(self.neon_distrib_dir.join("safekeeper"))
}
pub fn pg_data_dirs_path(&self) -> PathBuf {
@@ -422,10 +422,10 @@ impl LocalEnv {
"directory '{}' already exists. Perhaps already initialized?",
base_path.display()
);
if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
if !self.pg_bin_dir(pg_version).join("postgres").exists() {
bail!(
"Can't find postgres binary at {}",
self.pg_bin_dir(pg_version)?.display()
self.pg_bin_dir(pg_version).display()
);
}
for binary in ["pageserver", "safekeeper"] {

View File

@@ -1,21 +1,23 @@
use std::io::Write;
use std::path::PathBuf;
use std::process::Child;
use std::process::Command;
use std::sync::Arc;
use std::{io, result};
use std::time::Duration;
use std::{io, result, thread};
use anyhow::Context;
use anyhow::bail;
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use postgres::Config;
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::{http::error::HttpErrorBody, id::NodeId};
use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId};
use crate::connection::PgConnectionConfig;
use crate::pageserver::PageServerNode;
use crate::{
background_process,
local_env::{LocalEnv, SafekeeperConf},
};
use crate::local_env::{LocalEnv, SafekeeperConf};
use crate::storage::PageServerNode;
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
#[derive(Error, Debug)]
pub enum SafekeeperHttpError {
@@ -61,7 +63,7 @@ pub struct SafekeeperNode {
pub conf: SafekeeperConf,
pub pg_connection_config: PgConnectionConfig,
pub pg_connection_config: Config,
pub env: LocalEnv,
pub http_client: Client,
pub http_base_url: String,
@@ -85,15 +87,15 @@ impl SafekeeperNode {
}
/// Construct libpq connection string for connecting to this safekeeper.
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
fn safekeeper_connection_config(port: u16) -> Config {
// TODO safekeeper authentication not implemented yet
format!("postgresql://no_user@127.0.0.1:{port}/no_db")
format!("postgresql://no_user@127.0.0.1:{}/no_db", port)
.parse()
.unwrap()
}
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
env.safekeeper_data_dir(&format!("sk{sk_id}"))
env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
}
pub fn datadir_path(&self) -> PathBuf {
@@ -104,78 +106,92 @@ impl SafekeeperNode {
self.datadir_path().join("safekeeper.pid")
}
pub fn start(&self) -> anyhow::Result<Child> {
pub fn start(&self) -> anyhow::Result<()> {
print!(
"Starting safekeeper at '{}' in '{}'",
self.pg_connection_config.raw_address(),
connection_address(&self.pg_connection_config),
self.datadir_path().display()
);
io::stdout().flush().unwrap();
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
let id = self.id;
let datadir = self.datadir_path();
let id_string = id.to_string();
let mut args = vec![
"-D",
datadir.to_str().with_context(|| {
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
})?,
"--id",
&id_string,
"--listen-pg",
&listen_pg,
"--listen-http",
&listen_http,
];
let mut cmd = Command::new(self.env.safekeeper_bin()?);
fill_rust_env_vars(
cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
.args(&["--id", self.id.to_string().as_ref()])
.args(&["--listen-pg", &listen_pg])
.args(&["--listen-http", &listen_http])
.args(&["--recall", "1 second"])
.arg("--daemonize"),
);
if !self.conf.sync {
args.push("--no-sync");
cmd.arg("--no-sync");
}
let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
if !comma_separated_endpoints.is_empty() {
args.extend(["--broker-endpoints", &comma_separated_endpoints]);
cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
}
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
args.extend(["--broker-etcd-prefix", prefix]);
cmd.args(&["--broker-etcd-prefix", prefix]);
}
let mut backup_threads = String::new();
if let Some(threads) = self.conf.backup_threads {
backup_threads = threads.to_string();
args.extend(["--backup-threads", &backup_threads]);
} else {
drop(backup_threads);
cmd.args(&["--backup-threads", threads.to_string().as_ref()]);
}
if let Some(ref remote_storage) = self.conf.remote_storage {
args.extend(["--remote-storage", remote_storage]);
cmd.args(&["--remote-storage", remote_storage]);
}
let key_path = self.env.base_data_dir.join("auth_public_key.pem");
if self.conf.auth_enabled {
args.extend([
"--auth-validation-public-key-path",
key_path.to_str().with_context(|| {
format!("Key path {key_path:?} cannot be represented as a unicode string")
})?,
]);
cmd.arg("--auth-validation-public-key-path");
// PathBuf is better be passed as is, not via `String`.
cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
}
background_process::start_process(
&format!("safekeeper {id}"),
&datadir,
&self.env.safekeeper_bin(),
&args,
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
},
)
fill_aws_secrets_vars(&mut cmd);
if !cmd.status()?.success() {
bail!(
"Safekeeper failed to start. See '{}' for details.",
self.datadir_path().join("safekeeper.log").display()
);
}
// It takes a while for the safekeeper to start up. Wait until it is
// open for business.
const RETRIES: i8 = 15;
for retries in 1..RETRIES {
match self.check_status() {
Ok(_) => {
println!("\nSafekeeper started");
return Ok(());
}
Err(err) => {
match err {
SafekeeperHttpError::Transport(err) => {
if err.is_connect() && retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!(
"Safekeeper not responding yet, err {} retrying ({})...",
err, retries
);
}
}
SafekeeperHttpError::Response(msg) => {
bail!("safekeeper failed to start: {} ", msg)
}
}
thread::sleep(Duration::from_secs(1));
}
}
}
bail!("safekeeper failed to start in {} seconds", RETRIES);
}
///
@@ -187,11 +203,63 @@ impl SafekeeperNode {
/// If the server is not running, returns success
///
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
background_process::stop_process(
immediate,
&format!("safekeeper {}", self.id),
&self.pid_file(),
)
let pid_file = self.pid_file();
if !pid_file.exists() {
println!("Safekeeper {} is already stopped", self.id);
return Ok(());
}
let pid = read_pidfile(&pid_file)?;
let pid = Pid::from_raw(pid);
let sig = if immediate {
print!("Stopping safekeeper {} immediately..", self.id);
Signal::SIGQUIT
} else {
print!("Stopping safekeeper {} gracefully..", self.id);
Signal::SIGTERM
};
io::stdout().flush().unwrap();
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
println!(
"Safekeeper with pid {} does not exist, but a PID file was found",
pid
);
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to safekeeper with pid {}: {}",
pid,
err.desc()
),
}
// Wait until process is gone
for i in 0..600 {
let signal = None; // Send no signal, just get the error code
match kill(pid, signal) {
Ok(_) => (), // Process exists, keep waiting
Err(Errno::ESRCH) => {
// Process not found, we're done
println!("done!");
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
};
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
}
bail!("Failed to stop safekeeper with pid {}", pid);
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {

View File

@@ -1,27 +1,33 @@
use std::collections::HashMap;
use std::fs::{self, File};
use std::fs::File;
use std::io::{BufReader, Write};
use std::num::NonZeroU64;
use std::path::{Path, PathBuf};
use std::process::Child;
use std::{io, result};
use std::process::Command;
use std::time::Duration;
use std::{io, result, thread};
use crate::connection::PgConnectionConfig;
use anyhow::{bail, Context};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
};
use postgres::{Config, NoTls};
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::{
connstring::connection_address,
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
lsn::Lsn,
postgres_backend::AuthType,
};
use crate::{background_process, local_env::LocalEnv};
use crate::local_env::LocalEnv;
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
#[derive(Error, Debug)]
pub enum PageserverHttpError {
@@ -69,7 +75,7 @@ impl ResponseErrorMessageExt for Response {
//
#[derive(Debug)]
pub struct PageServerNode {
pub pg_connection_config: PgConnectionConfig,
pub pg_connection_config: Config,
pub env: LocalEnv,
pub http_client: Client,
pub http_base_url: String,
@@ -95,7 +101,7 @@ impl PageServerNode {
}
/// Construct libpq connection string for connecting to the pageserver.
fn pageserver_connection_config(password: &str, listen_addr: &str) -> PgConnectionConfig {
fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config {
format!("postgresql://no_user:{password}@{listen_addr}/no_db")
.parse()
.unwrap()
@@ -155,15 +161,7 @@ impl PageServerNode {
init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'");
}
let mut pageserver_process = self
.start_node(&init_config_overrides, &self.env.base_data_dir, true)
.with_context(|| {
format!(
"Failed to start a process for pageserver {}",
self.env.pageserver.id,
)
})?;
self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?;
let init_result = self
.try_init_timeline(create_tenant, initial_timeline_id, pg_version)
.context("Failed to create initial tenant and timeline for pageserver");
@@ -173,29 +171,7 @@ impl PageServerNode {
}
Err(e) => eprintln!("{e:#}"),
}
match pageserver_process.kill() {
Err(e) => {
eprintln!(
"Failed to stop pageserver {} process with pid {}: {e:#}",
self.env.pageserver.id,
pageserver_process.id(),
)
}
Ok(()) => {
println!(
"Stopped pageserver {} process with pid {}",
self.env.pageserver.id,
pageserver_process.id(),
);
// cleanup after pageserver startup, since we do not call regular `stop_process` during init
let pid_file = self.pid_file();
if let Err(e) = fs::remove_file(&pid_file) {
if e.kind() != io::ErrorKind::NotFound {
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
}
}
}
}
self.stop(false)?;
init_result
}
@@ -220,14 +196,11 @@ impl PageServerNode {
self.env.pageserver_data_dir()
}
/// The pid file is created by the pageserver process, with its pid stored inside.
/// Other pageservers cannot lock the same file and overwrite it for as long as the current
/// pageserver runs. (Unless someone removes the file manually; never do that!)
fn pid_file(&self) -> PathBuf {
pub fn pid_file(&self) -> PathBuf {
self.repo_path().join("pageserver.pid")
}
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
self.start_node(config_overrides, &self.repo_path(), false)
}
@@ -236,10 +209,10 @@ impl PageServerNode {
config_overrides: &[&str],
datadir: &Path,
update_config: bool,
) -> anyhow::Result<Child> {
) -> anyhow::Result<()> {
println!(
"Starting pageserver at '{}' in '{}'",
self.pg_connection_config.raw_address(),
connection_address(&self.pg_connection_config),
datadir.display()
);
io::stdout().flush()?;
@@ -247,7 +220,10 @@ impl PageServerNode {
let mut args = vec![
"-D",
datadir.to_str().with_context(|| {
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
format!(
"Datadir path '{}' cannot be represented as a unicode string",
datadir.display()
)
})?,
];
@@ -259,18 +235,48 @@ impl PageServerNode {
args.extend(["-c", config_override]);
}
background_process::start_process(
"pageserver",
datadir,
&self.env.pageserver_bin(),
&args,
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
Err(PageserverHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
},
)
let mut cmd = Command::new(self.env.pageserver_bin()?);
let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
filled_cmd = fill_aws_secrets_vars(filled_cmd);
if !filled_cmd.status()?.success() {
bail!(
"Pageserver failed to start. See console output and '{}' for details.",
datadir.join("pageserver.log").display()
);
}
// It takes a while for the page server to start up. Wait until it is
// open for business.
const RETRIES: i8 = 15;
for retries in 1..RETRIES {
match self.check_status() {
Ok(()) => {
println!("\nPageserver started");
return Ok(());
}
Err(err) => {
match err {
PageserverHttpError::Transport(err) => {
if err.is_connect() && retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!("Pageserver not responding yet, err {err} retrying ({retries})...");
}
}
PageserverHttpError::Response(msg) => {
bail!("pageserver failed to start: {msg} ")
}
}
thread::sleep(Duration::from_secs(1));
}
}
}
bail!("pageserver failed to start in {RETRIES} seconds");
}
///
@@ -282,18 +288,69 @@ impl PageServerNode {
/// If the server is not running, returns success
///
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
background_process::stop_process(immediate, "pageserver", &self.pid_file())
let pid_file = self.pid_file();
if !pid_file.exists() {
println!("Pageserver is already stopped");
return Ok(());
}
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
let sig = if immediate {
print!("Stopping pageserver immediately..");
Signal::SIGQUIT
} else {
print!("Stopping pageserver gracefully..");
Signal::SIGTERM
};
io::stdout().flush().unwrap();
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
println!("Pageserver with pid {pid} does not exist, but a PID file was found");
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {pid}: {}",
err.desc()
),
}
// Wait until process is gone
for i in 0..600 {
let signal = None; // Send no signal, just get the error code
match kill(pid, signal) {
Ok(_) => (), // Process exists, keep waiting
Err(Errno::ESRCH) => {
// Process not found, we're done
println!("done!");
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
};
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
}
bail!("Failed to stop pageserver with pid {pid}");
}
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
println!("Pageserver query: '{sql}'");
client.simple_query(sql).unwrap()
}
pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
self.pg_connection_config.connect_no_tls()
self.pg_connection_config.connect(NoTls)
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
@@ -362,11 +419,6 @@ impl PageServerNode {
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
trace_read_requests: settings
.remove("trace_read_requests")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -429,11 +481,6 @@ impl PageServerNode {
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
trace_read_requests: settings
.get("trace_read_requests")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
})
.send()?
.error_from_body()?;
@@ -502,7 +549,7 @@ impl PageServerNode {
pg_wal: Option<(Lsn, PathBuf)>,
pg_version: u32,
) -> anyhow::Result<()> {
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
// Init base reader
let (start_lsn, base_tarfile_path) = base;

View File

@@ -1,13 +0,0 @@
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG COMPUTE_IMAGE=compute-node-v14
ARG TAG=latest
FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG
USER root
RUN apt-get update && \
apt-get install -y curl \
jq \
netcat
USER postgres

View File

@@ -1,48 +0,0 @@
#!/bin/bash
set -eux
PG_VERSION=${PG_VERSION:-14}
SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
SPEC_FILE=/tmp/spec.json
echo "Waiting pageserver become ready."
while ! nc -z pageserver 6400; do
sleep 1;
done
echo "Page server is ready."
echo "Create a tenant and timeline"
PARAMS=(
-sb
-X POST
-H "Content-Type: application/json"
-d "{}"
http://pageserver:9898/v1/tenant/
)
tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g')
PARAMS=(
-sb
-X POST
-H "Content-Type: application/json"
-d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}"
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
)
result=$(curl "${PARAMS[@]}")
echo $result | jq .
echo "Overwrite tenant id and timeline id in spec file"
tenant_id=$(echo ${result} | jq -r .tenant_id)
timeline_id=$(echo ${result} | jq -r .timeline_id)
sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE}
sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
cat ${SPEC_FILE}
echo "Start compute node"
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
-C "postgresql://cloud_admin@localhost:55433/postgres" \
-b /usr/local/bin/postgres \
-S ${SPEC_FILE}

View File

@@ -1,141 +0,0 @@
{
"format_version": 1.0,
"timestamp": "2022-10-12T18:00:00.000Z",
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
"cluster": {
"cluster_id": "docker_compose",
"name": "docker_compose_test",
"state": "restarted",
"roles": [
{
"name": "cloud_admin",
"encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
"options": null
}
],
"databases": [
],
"settings": [
{
"name": "fsync",
"value": "off",
"vartype": "bool"
},
{
"name": "wal_level",
"value": "replica",
"vartype": "enum"
},
{
"name": "hot_standby",
"value": "on",
"vartype": "bool"
},
{
"name": "wal_log_hints",
"value": "on",
"vartype": "bool"
},
{
"name": "log_connections",
"value": "on",
"vartype": "bool"
},
{
"name": "port",
"value": "55433",
"vartype": "integer"
},
{
"name": "shared_buffers",
"value": "1MB",
"vartype": "string"
},
{
"name": "max_connections",
"value": "100",
"vartype": "integer"
},
{
"name": "listen_addresses",
"value": "0.0.0.0",
"vartype": "string"
},
{
"name": "max_wal_senders",
"value": "10",
"vartype": "integer"
},
{
"name": "max_replication_slots",
"value": "10",
"vartype": "integer"
},
{
"name": "wal_sender_timeout",
"value": "5s",
"vartype": "string"
},
{
"name": "wal_keep_size",
"value": "0",
"vartype": "integer"
},
{
"name": "password_encryption",
"value": "md5",
"vartype": "enum"
},
{
"name": "restart_after_crash",
"value": "off",
"vartype": "bool"
},
{
"name": "synchronous_standby_names",
"value": "walproposer",
"vartype": "string"
},
{
"name": "shared_preload_libraries",
"value": "neon",
"vartype": "string"
},
{
"name": "neon.safekeepers",
"value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
"vartype": "string"
},
{
"name": "neon.timeline_id",
"value": "TIMELINE_ID",
"vartype": "string"
},
{
"name": "neon.tenant_id",
"value": "TENANT_ID",
"vartype": "string"
},
{
"name": "neon.pageserver_connstring",
"value": "host=pageserver port=6400",
"vartype": "string"
},
{
"name": "max_replication_write_lag",
"value": "500MB",
"vartype": "string"
},
{
"name": "max_replication_flush_lag",
"value": "10GB",
"vartype": "string"
}
]
},
"delta_operations": [
]
}

View File

@@ -1,209 +0,0 @@
version: '3'
services:
etcd:
restart: always
image: quay.io/coreos/etcd:v3.5.4
ports:
- 2379:2379
- 2380:2380
environment:
# This signifficantly speeds up etcd and we anyway don't data persistency there.
ETCD_UNSAFE_NO_FSYNC: "1"
command:
- "etcd"
- "--auto-compaction-mode=revision"
- "--auto-compaction-retention=1"
- "--name=etcd-cluster"
- "--initial-cluster-state=new"
- "--initial-cluster-token=etcd-cluster-1"
- "--initial-cluster=etcd-cluster=http://etcd:2380"
- "--initial-advertise-peer-urls=http://etcd:2380"
- "--advertise-client-urls=http://etcd:2379"
- "--listen-client-urls=http://0.0.0.0:2379"
- "--listen-peer-urls=http://0.0.0.0:2380"
- "--quota-backend-bytes=134217728" # 128 MB
minio:
restart: always
image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
ports:
- 9000:9000
- 9001:9001
environment:
- MINIO_ROOT_USER=minio
- MINIO_ROOT_PASSWORD=password
command: server /data --address :9000 --console-address ":9001"
minio_create_buckets:
image: minio/mc
environment:
- MINIO_ROOT_USER=minio
- MINIO_ROOT_PASSWORD=password
entrypoint:
- "/bin/sh"
- "-c"
command:
- "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do
echo 'Waiting to start minio...' && sleep 1;
done;
/usr/bin/mc mb minio/neon --region=eu-north-1;
exit 0;"
depends_on:
- minio
pageserver:
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
environment:
- BROKER_ENDPOINT='http://etcd:2379'
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
ports:
#- 6400:6400 # pg protocol handler
- 9898:9898 # http endpoints
entrypoint:
- "/bin/sh"
- "-c"
command:
- "/usr/local/bin/pageserver -D /data/.neon/
-c \"broker_endpoints=[$$BROKER_ENDPOINT]\"
-c \"listen_pg_addr='0.0.0.0:6400'\"
-c \"listen_http_addr='0.0.0.0:9898'\"
-c \"remote_storage={endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/pageserver/'}\""
depends_on:
- etcd
- minio_create_buckets
safekeeper1:
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
- SAFEKEEPER_ID=1
- BROKER_ENDPOINT=http://etcd:2379
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
ports:
#- 5454:5454 # pg protocol handler
- 7676:7676 # http endpoints
entrypoint:
- "/bin/sh"
- "-c"
command:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID
--broker-endpoints=$$BROKER_ENDPOINT
-D /data
--remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\""
depends_on:
- etcd
- minio_create_buckets
safekeeper2:
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
- SAFEKEEPER_ID=2
- BROKER_ENDPOINT=http://etcd:2379
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
ports:
#- 5454:5454 # pg protocol handler
- 7677:7676 # http endpoints
entrypoint:
- "/bin/sh"
- "-c"
command:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID
--broker-endpoints=$$BROKER_ENDPOINT
-D /data
--remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\""
depends_on:
- etcd
- minio_create_buckets
safekeeper3:
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
- SAFEKEEPER_ID=3
- BROKER_ENDPOINT=http://etcd:2379
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
ports:
#- 5454:5454 # pg protocol handler
- 7678:7676 # http endpoints
entrypoint:
- "/bin/sh"
- "-c"
command:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID
--broker-endpoints=$$BROKER_ENDPOINT
-D /data
--remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\""
depends_on:
- etcd
- minio_create_buckets
compute:
restart: always
build:
context: ./compute_wrapper/
args:
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
- TAG=${TAG:-latest}
- http_proxy=$http_proxy
- https_proxy=$https_proxy
environment:
- PG_VERSION=${PG_VERSION:-14}
#- RUST_BACKTRACE=1
# Mount the test files directly, for faster editing cycle.
volumes:
- ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
- ./compute_wrapper/shell/:/shell/
ports:
- 55433:55433 # pg protocol handler
- 3080:3080 # http endpoints
entrypoint:
- "/shell/compute.sh"
depends_on:
- safekeeper1
- safekeeper2
- safekeeper3
- pageserver
compute_is_ready:
image: postgres:latest
entrypoint:
- "/bin/bash"
- "-c"
command:
- "until pg_isready -h compute -p 55433 ; do
echo 'Waiting to start compute...' && sleep 1;
done"
depends_on:
- compute

View File

@@ -1,60 +0,0 @@
#!/bin/bash
# A basic test to ensure Docker images are built correctly.
# Build a wrapper around the compute, start all services and runs a simple SQL query.
# Repeats the process for all currenly supported Postgres versions.
# Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file
# Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
# to verify custom image builds (e.g pre-published ones).
# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer.
set -eux -o pipefail
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
COMPUTE_CONTAINER_NAME=docker-compose-compute-1
SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
cleanup() {
echo "show container information"
docker ps
docker compose -f $COMPOSE_FILE logs
echo "stop containers..."
docker compose -f $COMPOSE_FILE down
}
echo "clean up containers if exists"
cleanup
for pg_version in 14 15; do
echo "start containers (pg_version=$pg_version)."
PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d
echo "wait until the compute is ready. timeout after 60s. "
cnt=0
while sleep 1; do
# check timeout
cnt=`expr $cnt + 1`
if [ $cnt -gt 60 ]; then
echo "timeout before the compute is ready."
cleanup
exit 1
fi
# check if the compute is ready
set +o pipefail
result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
set -o pipefail
if [ $result -eq 1 ]; then
echo "OK. The compute is ready to connect."
echo "execute simple queries."
docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
cleanup
break
fi
done
done

View File

@@ -37,7 +37,7 @@
- [Source view](./sourcetree.md)
- [docker.md](./docker.md) — Docker images and building pipeline.
- [Error handling and logging](./error-handling.md)
- [Error handling and logging]()
- [Testing]()
- [Unit testing]()
- [Integration testing]()
@@ -80,6 +80,4 @@
- [015-storage-messaging](rfcs/015-storage-messaging.md)
- [016-connection-routing](rfcs/016-connection-routing.md)
- [017-timeline-data-management](rfcs/017-timeline-data-management.md)
- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md)
- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md)
- [cluster-size-limits](rfcs/cluster-size-limits.md)

View File

@@ -18,67 +18,3 @@ We build all images after a successful `release` tests run and push automaticall
1. `neondatabase/compute-tools` and `neondatabase/compute-node`
2. `neondatabase/neon`
## Docker Compose example
You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers.
- etcd x 1
- pageserver x 1
- safekeeper x 3
- compute x 1
- MinIO x 1 # This is Amazon S3 compatible object storage
### How to use
1. create containers
You can specify version of neon cluster using following environment values.
- PG_VERSION: postgres version for compute (default is 14)
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
```
$ cd docker-compose/docker-compose.yml
$ docker-compose down # remove the conainers if exists
$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version
Creating network "dockercompose_default" with the default driver
Creating dockercompose_etcd3_1 ...
(...omit...)
```
2. connect compute node
```
$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
$ psql -h localhost -p 55433 -U cloud_admin
postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE
postgres=# insert into t values(1,1);
INSERT 0 1
postgres=# select * from t;
key | value
-----+-------
1 | 1
(1 row)
```
3. If you want to see the log, you can use `docker-compose logs` command.
```
# check the container name you want to see
$ docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1
(...omit...)
$ docker logs -f dockercompose_compute_1
2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql
2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400'
(...omit...)
```
4. If you want to see durable data in MinIO which is s3 compatible storage
Access http://localhost:9001 and sign in.
- Username: `minio`
- Password: `password`
You can see durable pages and WAL data in `neon` bucket.

View File

@@ -1,198 +0,0 @@
# Error handling and logging
## Logging errors
The principle is that errors are logged when they are handled. If you
just propagate an error to the caller in a function, you don't need to
log it; the caller will. But if you consume an error in a function,
you *must* log it (if it needs to be logged at all).
For example:
```rust
fn read_motd_file() -> std::io::Result<String> {
let mut f = File::open("/etc/motd")?;
let mut result = String::new();
f.read_to_string(&mut result)?;
result
}
```
Opening or reading the file could fail, but there is no need to log
the error here. The function merely propagates the error to the
caller, and it is up to the caller to log the error or propagate it
further, if the failure is not expected. But if, for example, it is
normal that the "/etc/motd" file doesn't exist, the caller can choose
to silently ignore the error, or log it as an INFO or DEBUG level
message:
```rust
fn get_message_of_the_day() -> String {
// Get the motd from /etc/motd, or return the default proverb
match read_motd_file() {
Ok(motd) => motd,
Err(err) => {
// It's normal that /etc/motd doesn't exist, but if we fail to
// read it for some other reason, that's unexpected. The message
// of the day isn't very important though, so we just WARN and
// continue with the default in any case.
if err.kind() != std::io::ErrorKind::NotFound {
tracing::warn!("could not read \"/etc/motd\": {err:?}");
}
"An old error is always more popular than a new truth. - German proverb"
}
}
}
```
## Error types
We use the `anyhow` crate widely. It contains many convenient macros
like `bail!` and `ensure!` to construct and return errors, and to
propagate many kinds of low-level errors, wrapped in `anyhow::Error`.
A downside of `anyhow::Error` is that the caller cannot distinguish
between different error cases. Most errors are propagated all the way
to the mgmt API handler function, or the main loop that handles a
connection with the compute node, and they are all handled the same
way: the error is logged and returned to the client as an HTTP or
libpq error.
But in some cases, we need to distinguish between errors and handle
them differently. For example, attaching a tenant to the pageserver
could fail either because the tenant has already been attached, or
because we could not load its metadata from cloud storage. The first
case is more or less expected. The console sends the Attach request to
the pageserver, and the pageserver completes the operation, but the
network connection might be lost before the console receives the
response. The console will retry the operation in that case, but the
tenant has already been attached. It is important that the pagserver
responds with the HTTP 403 Already Exists error in that case, rather
than a generic HTTP 500 Internal Server Error.
If you need to distinguish between different kinds of errors, create a
new `Error` type. The `thiserror` crate is useful for that. But in
most cases `anyhow::Error` is good enough.
## Panics
Depending on where a panic happens, it can cause the whole pageserver
or safekeeper to restart, or just a single tenant. In either case,
that is pretty bad and causes an outage. Avoid panics. Never use
`unwrap()` or other calls that might panic, to verify inputs from the
network or from disk.
It is acceptable to use functions that might panic, like `unwrap()`, if
it is obvious that it cannot panic. For example, if you have just
checked that a variable is not None, it is OK to call `unwrap()` on it,
but it is still preferable to use `expect("reason")` instead to explain
why the function cannot fail.
`assert!` and `panic!` are reserved for checking clear invariants and
very obvious "can't happen" cases. When in doubt, use anyhow `ensure!`
or `bail!` instead.
## Error levels
`tracing::Level` doesn't provide very clear guidelines on what the
different levels mean, or when to use which level. Here is how we use
them:
### Error
Examples:
- could not open file "foobar"
- invalid tenant id
Errors are not expected to happen during normal operation. Incorrect
inputs from client can cause ERRORs. For example, if a client tries to
call a mgmt API that doesn't exist, or if a compute node sends passes
an LSN that has already been garbage collected away.
These should *not* happen during normal operations. "Normal
operations" is not a very precise concept. But for example, disk
errors are not expected to happen when the system is working, so those
count as Errors. However, if a TCP connection to a compute node is
lost, that is not considered an Error, because it doesn't affect the
pageserver's or safekeeper's operation in any way, and happens fairly
frequently when compute nodes are shut down, or are killed abruptly
because of errors in the compute.
**Errors are monitored, and always need human investigation to determine
the cause.**
Whether something should be logged at ERROR, WARNING or INFO level can
depend on the callers and clients. For example, it might be unexpected
and a sign of a serious issue if the console calls the
"timeline_detail" mgmt API for a timeline that doesn't exist. ERROR
would be appropriate in that case. But if the console routinely calls
the API after deleting a timeline, to check if the deletion has
completed, then it would be totally normal and an INFO or DEBUG level
message would be more appropriate. If a message is logged as an ERROR,
but it in fact happens frequently in production and never requires any
action, it should probably be demoted to an INFO level message.
### Warn
Examples:
- could not remove temporary file "foobar.temp"
- unrecognized file "foobar" in timeline directory
Warnings are similar to Errors, in that they should not happen
when the system is operating normally. The difference between Error and
Warning is that an Error means that the operation failed, whereas Warning
means that something unexpected happened, but the operation continued anyway.
For example, if deleting a file fails because the file already didn't exist,
it should be logged as Warning.
> **Note:** The python regression tests, under `test_regress`, check the
> pageserver log after each test for any ERROR and WARN lines. If there are
> any ERRORs or WARNs that have not been explicitly listed in the test as
> allowed, the test is marked a failed. This is to catch unexpected errors
> e.g. in background operations, that don't cause immediate misbehaviour in
> the tested functionality.
### Info
Info level is used to log useful information when the system is
operating normally. Info level is appropriate e.g. for logging state
changes, background operations, and network connections.
Examples:
- "system is shutting down"
- "tenant was created"
- "retrying S3 upload"
### Debug & Trace
Debug and Trace level messages are not printed to the log in our normal
production configuration, but could be enabled for a specific server or
tenant, to aid debugging. (Although we don't actually have that
capability as of this writing).
## Context
We use logging "spans" to hold context information about the current
operation. Almost every operation happens on a particular tenant and
timeline, so we enter a span with the "tenant_id" and "timeline_id"
very early when processing an incoming API request, for example. All
background operations should also run in a span containing at least
those two fields, and any other parameters or information that might
be useful when debugging an error that might happen when performing
the operation.
TODO: Spans are not captured in the Error when it is created, but when
the error is logged. It would be more useful to capture them at Error
creation. We should consider using `tracing_error::SpanTrace` to do
that.
## Error message style
PostgreSQL has a style guide for writing error messages:
https://www.postgresql.org/docs/current/error-style-guide.html
Follow that guide when writing error messages in the PostgreSQL
extension. We don't follow it strictly in the pageserver and
safekeeper, but the advice in the PostgreSQL style guide is generally
good, and you can't go wrong by following it.

View File

@@ -1,91 +0,0 @@
# Managing Tenant and Timeline lifecycles
## Summary
The pageserver has a Tenant object in memory for each tenant it manages, and a
Timeline for each timeline. There are a lot of tasks that operate on the tenants
and timelines with references to those objects. We have some mechanisms to track
which tasks are operating on each Tenant and Timeline, and to request them to
shutdown when a tenant or timeline is deleted, but it does not cover all uses,
and as a result we have many race conditions around tenant/timeline shutdown.
## Motivation
We have a bunch of race conditions that can produce weird errors and can be hard
to track down.
## Non Goals
This RFC only covers the problem of ensuring that a task/thread isn't operating
on a Tenant or Timeline. It does not cover what states, aside from Active and
non-Active, each Tenant and Timeline should have, or when exactly the transitions
should happen.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
Pageserver. Although I wonder if the safekeeper should have a similar mechanism.
## Current situation
Most pageserver tasks of are managed by task_mgr.rs:
- LibpqEndpointListener
- HttpEndPointListener
- WalReceiverManager and -Connection
- GarbageCollector and Compaction
- InitialLogicalSizeCalculation
In addition to those tasks, the walreceiver performs some direct tokio::spawn
calls to spawn tasks that are not registered with 'task_mgr'. And all of these
tasks can spawn extra operations with tokio spawn_blocking.
Whenever a tenant or timeline is removed from the system, by pageserver
shutdown, delete_timeline or tenant-detach operation, we rely on the task
registry in 'task_mgr.rs' to wait until there are no tasks operating on the
tenant or timeline, before its Tenant/Timeline object is removed. That relies on
each task to register itself with the tenant/timeline ID in
'task_mgr.rs'. However, there are many gaps in that. For example,
GarbageCollection and Compaction tasks are registered with the tenant, but when
they proceed to operate on a particular timeline of the tenant, they don't
register with timeline ID. Because of that, the timeline can be deleted while GC
or compaction is running on it, causing failures in the GC or compaction (see
https://github.com/neondatabase/neon/issues/2442).
Another problem is that the task registry only works for tokio Tasks. There is
no way to register a piece of code that runs inside spawn_blocking(), for
example.
## Proposed implementation
This "voluntary" registration of tasks is fragile. Let's use Rust language features
to enforce that a tenant/timeline cannot be removed from the system when there is
still some code operating on it.
Let's introduce new Guard objects for Tenant and Timeline, and do all actions through
the Guard object. Something like:
TenantActiveGuard: Guard object over Arc<Tenant>. When you acquire the guard,
the code checks that the tenant is in Active state. If it's not, you get an
error. You can change the state of the tenant to Stopping while there are
ActiveTenantGuard objects still on it, to prevent new ActiveTenantGuards from
being acquired, but the Tenant cannot be removed until all the guards are gone.
TenantMaintenanceGuard: Like ActiveTenantGuard, but can be held even when the
tenant is not in Active state. Used for operations like attach/detach. Perhaps
allow only one such guard on a Tenant at a time.
Similarly for Timelines. We don't currentl have a "state" on Timeline, but I think
we need at least two states: Active and Stopping. The Stopping state is used at
deletion, to prevent new TimelineActiveGuards from appearing, while you wait for
existing TimelineActiveGuards to die out.
The shutdown-signaling, using shutdown_watcher() and is_shutdown_requested(),
probably also needs changes to deal with the new Guards. The rule is that if you
have a TenantActiveGuard, and the tenant's state changes from Active to
Stopping, the is_shutdown_requested() function should return true, and
shutdown_watcher() future should return.
This signaling doesn't neessarily need to cover all cases. For example, if you
have a block of code in spawn_blocking(), it might be acceptable if
is_shutdown_requested() doesn't return true even though the tenant is in
Stopping state, as long as the code finishes reasonably fast.

View File

@@ -1,246 +0,0 @@
# Coordinating access of multiple pageservers to the same s3 data
## Motivation
There are some blind spots around coordinating access of multiple pageservers
to the same s3 data. Currently this is applicable only to tenant relocation
case, but in the future we'll need to solve similar problems for
replica/standby pageservers.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
Pageserver
## The problem
### Relocation
During relocation both pageservers can write to s3. This should be ok for all
data except the `index_part.json`. For index part it causes problems during
compaction/gc because they remove files from index/s3.
Imagine this case:
```mermaid
sequenceDiagram
autonumber
participant PS1
participant S3
participant PS2
PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
PS2->>S3: Attach called, sees L1, L2
PS1->>S3: Compaction comes <br/> Removes L1, adds L3
note over S3: Index now L2, L3
PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
note over S3: Index now L1, L2, L4
```
At this point it is not possible to restore from index, it contains L2 which
is no longer available in s3 and doesnt contain L3 added by compaction by the
first pageserver. So if any of the pageservers restart initial sync will fail
(or in on-demand world it will fail a bit later during page request from
missing layer)
### Standby pageserver
Another related case is standby pageserver. In this case second pageserver can
be used as a replica to scale reads and serve as a failover target in case
first one fails.
In this mode second pageserver needs to have the same picture of s3 files to
be able to load layers on-demand. To accomplish that second pageserver
cannot run gc/compaction jobs. Instead it needs to receive updates for index
contents. (There is no need to run walreceiver on the second pageserver then).
## Observations
- If both pageservers ingest wal then their layer set diverges, because layer
file generation is not deterministic
- If one of the pageservers does not ingest wal (and just picks up layer
updates) then it lags behind and cannot really answer queries in the same
pace as the primary one
- Can compaction help make layers deterministic? E g we do not upload level
zero layers and construction of higher levels should be deterministic.
This way we can guarantee that layer creation by timeout wont mess things up.
This way one pageserver uploads data and second one can just ingest it.
But we still need some form of election
## Solutions
### Manual orchestration
One possible solution for relocation case is to orchestrate background jobs
from outside. The oracle who runs migration can turn off background jobs on
PS1 before migration and then run migration -> enable them on PS2. The problem
comes if migration fails. In this case in order to resume background jobs
oracle needs to guarantee that PS2 doesnt run background jobs and if it doesnt
respond then PS1 is stuck unable to run compaction/gc. This cannot be solved
without human ensuring that no upload from PS2 can happen. In order to be able
to resolve this automatically CAS is required on S3 side so pageserver can
avoid overwriting index part if it is no longer the leading one
Note that flag that disables background jobs needs to be persistent, because
otherwise pageserver restart will clean it
### Avoid index_part.json
Index part consists of two parts, list of layers and metadata. List of layers
can be easily obtained by `ListObjects` S3 API method. But what to do with
metadata? Create metadata instance for each checkpoint and add some counter
to the file name?
Back to potentially long s3 ls.
### Coordination based approach
Do it like safekeepers chose leader for WAL upload. Ping each other and decide
based on some heuristics e g smallest node id. During relocation PS1 sends
"resign" ping message so others can start election without waiting for a timeout.
This still leaves metadata question open and non deterministic layers are a
problem as well
### Avoid metadata file
One way to eliminate metadata file is to store it in layer files under some
special key. This may resonate with intention to keep all relation sizes in
some special segment to avoid initial download during size calculation.
Maybe with that we can even store pre calculated value.
As a downside each checkpoint gets 512 bytes larger.
If we entirely avoid metadata file this opens up many approaches
* * *
During discussion it seems that we converged on the approach consisting of:
- index files stored per pageserver in the same timeline directory. With that
index file name starts to look like: `<pageserver_node_id>_index_part.json`.
In such set up there are no concurrent overwrites of index file by different
pageservers.
- For replica pageservers the solution would be for primary to broadcast index
changes to any followers with an ability to check index files in s3 and
restore the full state. To properly merge changes with index files we can use
a counter that is persisted in an index file, is incremented on every change
to it and passed along with broadcasted change. This way we can determine
whether we need to apply change to the index state or not.
- Responsibility for running background jobs is assigned externally. Pageserver
keeps locally persistent flag for each tenant that indicates whether this
pageserver is considered as primary one or not. TODO what happends if we
crash and cannot start for some extended period of time? Control plane can
assign ownership to some other pageserver. Pageserver needs some way to check
if its still the blessed one. Maybe by explicit request to control plane on
start.
Requirement for deterministic layer generation was considered overly strict
because of two reasons:
- It can limit possible optimizations e g when pageserver wants to reshuffle
some data locally and doesnt want to coordinate this
- The deterministic algorithm itself can change so during deployments for some
time there will be two different version running at the same time which can
cause non determinism
### External elections
The above case with lost state in this schema with externally managed
leadership is represented like this:
Note that here we keep objects list in the index file.
```mermaid
sequenceDiagram
autonumber
participant PS1
participant CP as Control Plane
participant S3
participant PS2
note over PS1,PS2: PS1 starts up and still a leader
PS1->>CP: Am I still the leader for Tenant X?
activate CP
CP->>PS1: Yes
deactivate CP
PS1->>S3: Fetch PS1 index.
note over PS1: Continue operations, start backround jobs
note over PS1,PS2: PS1 starts up and still and is not a leader anymore
PS1->>CP: Am I still the leader for Tenant X?
CP->>PS1: No
PS1->>PS2: Subscribe to index changes
PS1->>S3: Fetch PS1 and PS2 indexes
note over PS1: Combine index file to include layers <br> from both indexes to be able <br> to see newer files from leader (PS2)
note over PS1: Continue operations, do not start background jobs
```
### Internal elections
To manage leadership internally we can use broker to exchange pings so nodes
can decide on the leader roles. In case multiple pageservers are active leader
is the one with lowest node id.
Operations with internally managed elections:
```mermaid
sequenceDiagram
autonumber
participant PS1
participant S3
note over PS1: Starts up
note over PS1: Subscribes to changes, waits for two ping <br> timeouts to see if there is a leader
PS1->>S3: Fetch indexes from s3
alt there is a leader
note over PS1: do not start background jobs, <br> continue applying index updates
else there is no leader
note over PS1: start background jobs, <br> broadcast index changes
end
note over PS1,S3: Then the picture is similar to external elections <br> the difference is that follower can become a leader <br> if there are no pings after some timeout new leader gets elected
```
### Eviction
When two pageservers operate on a tenant for extended period of time follower
doesnt perform write operations in s3. When layer is evicted follower relies
on updates from primary to get info about layers it needs to cover range for
evicted layer.
Note that it wont match evicted layer exactly, so layers will overlap and
lookup code needs to correctly handle that.
### Relocation flow
Actions become:
- Attach tenant to new pageserver
- New pageserver becomes follower since previous one is still leading
- New pageserver starts replicating from safekeepers but does not upload layers
- Detach is called on the old one
- New pageserver becomes leader after it realizes that old one disappeared
### Index File
Using `s3 ls` on startup simplifies things, but we still need metadata, so we
need to fetch index files anyway. If they contain list of files we can combine
them and avoid costly `s3 ls`
### Remaining issues
- More than one remote consistent lsn for safekeepers to know
Anything else?
### Proposed solution
To recap. On meeting we converged on approach with external elections but I
think it will be overall harder to manage and will introduce a dependency on
control plane for pageserver. Using separate index files for each pageserver
consisting of log of operations and a metadata snapshot should be enough.
### What we need to get there?
- Change index file structure to contain log of changes instead of just the
file list
- Implement pinging/elections for pageservers

View File

@@ -52,10 +52,6 @@ PostgreSQL extension that implements storage manager API and network communicati
PostgreSQL extension that contains functions needed for testing and debugging.
`/pgxn/neon_walredo`:
Library to run Postgres as a "WAL redo process" in the pageserver.
`/safekeeper`:
The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.

View File

@@ -29,9 +29,6 @@ pub struct SkTimelineInfo {
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(default)]
pub peer_horizon_lsn: Option<Lsn>,
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(default)]
pub local_start_lsn: Option<Lsn>,
/// A connection string to use for WAL receiving.
#[serde(default)]
pub safekeeper_connstr: Option<String>,

View File

@@ -7,10 +7,6 @@ edition = "2021"
serde = { version = "1.0", features = ["derive"] }
serde_with = "2.0"
const_format = "0.2.21"
anyhow = { version = "1.0", features = ["backtrace"] }
bytes = "1.0.1"
byteorder = "1.4.3"
utils = { path = "../utils" }
postgres_ffi = { path = "../postgres_ffi" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -2,7 +2,6 @@ use const_format::formatcp;
/// Public API types
pub mod models;
pub mod reltag;
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");

View File

@@ -1,6 +1,5 @@
use std::num::NonZeroU64;
use byteorder::{BigEndian, ReadBytesExt};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::{
@@ -8,10 +7,6 @@ use utils::{
lsn::Lsn,
};
use crate::reltag::RelTag;
use anyhow::bail;
use bytes::{BufMut, Bytes, BytesMut};
/// A state of a tenant in pageserver's memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TenantState {
@@ -24,22 +19,6 @@ pub enum TenantState {
Broken,
}
/// A state of a timeline in pageserver's memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TimelineState {
/// Timeline is fully operational, its background jobs are running.
Active,
/// A timeline is recognized by pageserver, but not yet ready to operate.
/// The status indicates, that the timeline could eventually go back to Active automatically:
/// for example, if the owning tenant goes back to Active again.
Suspended,
/// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to
/// automatically become Active after certain events: only a management call can change this status.
Paused,
/// A timeline is recognized by the pageserver, but no longer used for any operations, as failed to get activated.
Broken,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
@@ -73,7 +52,6 @@ pub struct TenantCreateRequest {
pub walreceiver_connect_timeout: Option<String>,
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
}
#[serde_as]
@@ -113,7 +91,6 @@ pub struct TenantConfigRequest {
pub walreceiver_connect_timeout: Option<String>,
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
}
impl TenantConfigRequest {
@@ -132,7 +109,6 @@ impl TenantConfigRequest {
walreceiver_connect_timeout: None,
lagging_wal_timeout: None,
max_lsn_wal_lag: None,
trace_read_requests: None,
}
}
}
@@ -184,8 +160,6 @@ pub struct TimelineInfo {
pub remote_consistent_lsn: Option<Lsn>,
pub awaits_download: bool,
pub state: TimelineState,
// Some of the above fields are duplicated in 'local' and 'remote', for backwards-
// compatility with older clients.
pub local: LocalTimelineInfo,
@@ -227,262 +201,3 @@ pub struct FailpointConfig {
pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>,
}
// Wrapped in libpq CopyData
#[derive(PartialEq, Eq)]
pub enum PagestreamFeMessage {
Exists(PagestreamExistsRequest),
Nblocks(PagestreamNblocksRequest),
GetPage(PagestreamGetPageRequest),
DbSize(PagestreamDbSizeRequest),
}
// Wrapped in libpq CopyData
pub enum PagestreamBeMessage {
Exists(PagestreamExistsResponse),
Nblocks(PagestreamNblocksResponse),
GetPage(PagestreamGetPageResponse),
Error(PagestreamErrorResponse),
DbSize(PagestreamDbSizeResponse),
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamExistsRequest {
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamNblocksRequest {
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetPageRequest {
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
pub blkno: u32,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamDbSizeRequest {
pub latest: bool,
pub lsn: Lsn,
pub dbnode: u32,
}
#[derive(Debug)]
pub struct PagestreamExistsResponse {
pub exists: bool,
}
#[derive(Debug)]
pub struct PagestreamNblocksResponse {
pub n_blocks: u32,
}
#[derive(Debug)]
pub struct PagestreamGetPageResponse {
pub page: Bytes,
}
#[derive(Debug)]
pub struct PagestreamErrorResponse {
pub message: String,
}
#[derive(Debug)]
pub struct PagestreamDbSizeResponse {
pub db_size: i64,
}
impl PagestreamFeMessage {
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
match self {
Self::Exists(req) => {
bytes.put_u8(0);
bytes.put_u8(if req.latest { 1 } else { 0 });
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
bytes.put_u8(req.rel.forknum);
}
Self::Nblocks(req) => {
bytes.put_u8(1);
bytes.put_u8(if req.latest { 1 } else { 0 });
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
bytes.put_u8(req.rel.forknum);
}
Self::GetPage(req) => {
bytes.put_u8(2);
bytes.put_u8(if req.latest { 1 } else { 0 });
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
bytes.put_u8(req.rel.forknum);
bytes.put_u32(req.blkno);
}
Self::DbSize(req) => {
bytes.put_u8(3);
bytes.put_u8(if req.latest { 1 } else { 0 });
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.dbnode);
}
}
bytes.into()
}
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
// TODO these gets can fail
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.read_u8()?;
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
})),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
})),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
blkno: body.read_u32::<BigEndian>()?,
})),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
dbnode: body.read_u32::<BigEndian>()?,
})),
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
}
}
}
impl PagestreamBeMessage {
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
match self {
Self::Exists(resp) => {
bytes.put_u8(100); /* tag from pagestore_client.h */
bytes.put_u8(resp.exists as u8);
}
Self::Nblocks(resp) => {
bytes.put_u8(101); /* tag from pagestore_client.h */
bytes.put_u32(resp.n_blocks);
}
Self::GetPage(resp) => {
bytes.put_u8(102); /* tag from pagestore_client.h */
bytes.put(&resp.page[..]);
}
Self::Error(resp) => {
bytes.put_u8(103); /* tag from pagestore_client.h */
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(104); /* tag from pagestore_client.h */
bytes.put_i64(resp.db_size);
}
}
bytes.into()
}
}
#[cfg(test)]
mod tests {
use bytes::Buf;
use super::*;
#[test]
fn test_pagestream() {
// Test serialization/deserialization of PagestreamFeMessage
let messages = vec![
PagestreamFeMessage::Exists(PagestreamExistsRequest {
latest: true,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
dbnode: 3,
relnode: 4,
},
}),
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
latest: false,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
dbnode: 3,
relnode: 4,
},
}),
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
latest: true,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
dbnode: 3,
relnode: 4,
},
blkno: 7,
}),
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
latest: true,
lsn: Lsn(4),
dbnode: 7,
}),
];
for msg in messages {
let bytes = msg.serialize();
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
assert!(msg == reconstructed);
}
}
}

View File

@@ -37,22 +37,22 @@ pub static REQUIRED_POSTGRES_CONFIG: Lazy<Vec<&'static str>> = Lazy::new(|| {
});
impl Conf {
pub fn pg_distrib_dir(&self) -> anyhow::Result<PathBuf> {
pub fn pg_distrib_dir(&self) -> PathBuf {
let path = self.pg_distrib_dir.clone();
match self.pg_version {
14 => Ok(path.join(format!("v{}", self.pg_version))),
15 => Ok(path.join(format!("v{}", self.pg_version))),
_ => bail!("Unsupported postgres version: {}", self.pg_version),
14 => path.join(format!("v{}", self.pg_version)),
15 => path.join(format!("v{}", self.pg_version)),
_ => panic!("Unsupported postgres version: {}", self.pg_version),
}
}
fn pg_bin_dir(&self) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir()?.join("bin"))
fn pg_bin_dir(&self) -> PathBuf {
self.pg_distrib_dir().join("bin")
}
fn pg_lib_dir(&self) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir()?.join("lib"))
fn pg_lib_dir(&self) -> PathBuf {
self.pg_distrib_dir().join("lib")
}
pub fn wal_dir(&self) -> PathBuf {
@@ -60,12 +60,12 @@ impl Conf {
}
fn new_pg_command(&self, command: impl AsRef<Path>) -> Result<Command> {
let path = self.pg_bin_dir()?.join(command);
let path = self.pg_bin_dir().join(command);
ensure!(path.exists(), "Command {:?} does not exist", path);
let mut cmd = Command::new(path);
cmd.env_clear()
.env("LD_LIBRARY_PATH", self.pg_lib_dir()?)
.env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?);
.env("LD_LIBRARY_PATH", self.pg_lib_dir())
.env("DYLD_LIBRARY_PATH", self.pg_lib_dir());
Ok(cmd)
}

View File

@@ -1,16 +0,0 @@
[package]
name = "pq_proto"
version = "0.1.0"
edition = "2021"
[dependencies]
anyhow = "1.0"
bytes = "1.0.1"
pin-project-lite = "0.2.7"
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
rand = "0.8.3"
serde = { version = "1.0", features = ["derive"] }
tokio = { version = "1.17", features = ["macros"] }
tracing = "0.1"
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -16,7 +16,7 @@ use tokio::{
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
};
use tracing::*;
use utils::crashsafe::path_with_suffix_extension;
use utils::crashsafe_dir::path_with_suffix_extension;
use crate::{Download, DownloadError, RemoteObjectId};

View File

@@ -1,3 +0,0 @@
*.dot
*.png
*.svg

View File

@@ -1,8 +0,0 @@
[package]
name = "tenant_size_model"
version = "0.1.0"
edition = "2021"
publish = false
[dependencies]
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -1,13 +0,0 @@
all: 1.svg 2.svg 3.svg 4.svg 1.png 2.png 3.png 4.png
../../target/debug/tenant_size_model: Cargo.toml src/main.rs src/lib.rs
cargo build --bin tenant_size_model
%.svg: %.dot
dot -Tsvg $< > $@
%.png: %.dot
dot -Tpng $< > $@
%.dot: ../../target/debug/tenant_size_model
../../target/debug/tenant_size_model $* > $@

View File

@@ -1,7 +0,0 @@
# Logical size + WAL pricing
This is a simulator to calculate the tenant size in different scenarios,
using the "Logical size + WAL" method. Makefile produces diagrams used in a
private presentation:
https://docs.google.com/presentation/d/1OapE4k11xmcwMh7I7YvNWGC63yCRLh6udO9bXZ-fZmo/edit?usp=sharing

View File

@@ -1,349 +0,0 @@
use std::borrow::Cow;
use std::collections::HashMap;
/// Pricing model or history size builder.
///
/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
/// type.
pub struct Storage<K: 'static> {
segments: Vec<Segment>,
/// Mapping from the branch name to the index of a segment describing it's latest state.
branches: HashMap<K, usize>,
}
/// Snapshot of a branch.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Segment {
/// Previous segment index into ['Storage::segments`], if any.
parent: Option<usize>,
/// Description of how did we get to this state.
///
/// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when
/// modifying a branch directly.
pub op: Cow<'static, str>,
/// LSN before this state
start_lsn: u64,
/// LSN at this state
pub end_lsn: u64,
/// Logical size before this state
start_size: u64,
/// Logical size at this state
pub end_size: u64,
/// Indices to [`Storage::segments`]
///
/// FIXME: this could be an Option<usize>
children_after: Vec<usize>,
/// Determined by `retention_period` given to [`Storage::calculate`]
pub needed: bool,
}
//
//
//
//
// *-g--*---D--->
// /
// /
// / *---b----*-B--->
// / /
// / /
// -----*--e---*-----f----* C
// E \
// \
// *--a---*---A-->
//
// If A and B need to be retained, is it cheaper to store
// snapshot at C+a+b, or snapshots at A and B ?
//
// If D also needs to be retained, which is cheaper:
//
// 1. E+g+e+f+a+b
// 2. D+C+a+b
// 3. D+A+B
/// [`Segment`] which has had it's size calculated.
pub struct SegmentSize {
pub seg_id: usize,
pub method: SegmentMethod,
this_size: u64,
pub children: Vec<SegmentSize>,
}
impl SegmentSize {
fn total(&self) -> u64 {
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
}
pub fn total_children(&self) -> u64 {
if self.method == SnapshotAfter {
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
} else {
self.children.iter().fold(0, |acc, x| acc + x.total())
}
}
}
/// Different methods to retain history from a particular state
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SegmentMethod {
SnapshotAfter,
Wal,
WalNeeded,
Skipped,
}
use SegmentMethod::*;
impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
/// Creates a new storage with the given default branch name.
pub fn new(initial_branch: K) -> Storage<K> {
let init_segment = Segment {
op: "".into(),
needed: false,
parent: None,
start_lsn: 0,
end_lsn: 0,
start_size: 0,
end_size: 0,
children_after: Vec::new(),
};
Storage {
segments: vec![init_segment],
branches: HashMap::from([(initial_branch, 0)]),
}
}
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
pub fn modify_branch<Q: ?Sized>(
&mut self,
branch: &Q,
op: Cow<'static, str>,
lsn_bytes: u64,
size_bytes: i64,
) where
K: std::borrow::Borrow<Q>,
Q: std::hash::Hash + Eq,
{
let lastseg_id = *self.branches.get(branch).unwrap();
let newseg_id = self.segments.len();
let lastseg = &mut self.segments[lastseg_id];
let newseg = Segment {
op,
parent: Some(lastseg_id),
start_lsn: lastseg.end_lsn,
end_lsn: lastseg.end_lsn + lsn_bytes,
start_size: lastseg.end_size,
end_size: (lastseg.end_size as i64 + size_bytes) as u64,
children_after: Vec::new(),
needed: false,
};
lastseg.children_after.push(newseg_id);
self.segments.push(newseg);
*self.branches.get_mut(branch).expect("read already") = newseg_id;
}
pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
where
K: std::borrow::Borrow<Q>,
Q: std::hash::Hash + Eq,
{
self.modify_branch(branch, "insert".into(), bytes, bytes as i64);
}
pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
where
K: std::borrow::Borrow<Q>,
Q: std::hash::Hash + Eq,
{
self.modify_branch(branch, "update".into(), bytes, 0i64);
}
pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
where
K: std::borrow::Borrow<Q>,
Q: std::hash::Hash + Eq,
{
self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64));
}
/// Panics if the parent branch cannot be found.
pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K)
where
K: std::borrow::Borrow<Q>,
Q: std::hash::Hash + Eq,
{
// Find the right segment
let branchseg_id = *self
.branches
.get(parent)
.expect("should had found the parent by key");
let _branchseg = &mut self.segments[branchseg_id];
// Create branch name for it
self.branches.insert(name, branchseg_id);
}
pub fn calculate(&mut self, retention_period: u64) -> SegmentSize {
// Phase 1: Mark all the segments that need to be retained
for (_branch, &last_seg_id) in self.branches.iter() {
let last_seg = &self.segments[last_seg_id];
let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period);
let mut seg_id = last_seg_id;
loop {
let seg = &mut self.segments[seg_id];
if seg.end_lsn < cutoff_lsn {
break;
}
seg.needed = true;
if let Some(prev_seg_id) = seg.parent {
seg_id = prev_seg_id;
} else {
break;
}
}
}
// Phase 2: For each oldest segment in a chain that needs to be retained,
// calculate if we should store snapshot or WAL
self.size_from_snapshot_later(0)
}
fn size_from_wal(&self, seg_id: usize) -> SegmentSize {
let seg = &self.segments[seg_id];
let this_size = seg.end_lsn - seg.start_lsn;
let mut children = Vec::new();
// try both ways
for &child_id in seg.children_after.iter() {
// try each child both ways
let child = &self.segments[child_id];
let p1 = self.size_from_wal(child_id);
let p = if !child.needed {
let p2 = self.size_from_snapshot_later(child_id);
if p1.total() < p2.total() {
p1
} else {
p2
}
} else {
p1
};
children.push(p);
}
SegmentSize {
seg_id,
method: if seg.needed { WalNeeded } else { Wal },
this_size,
children,
}
}
fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize {
// If this is needed, then it's time to do the snapshot and continue
// with wal method.
let seg = &self.segments[seg_id];
//eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed);
if seg.needed {
let mut children = Vec::new();
for &child_id in seg.children_after.iter() {
// try each child both ways
let child = &self.segments[child_id];
let p1 = self.size_from_wal(child_id);
let p = if !child.needed {
let p2 = self.size_from_snapshot_later(child_id);
if p1.total() < p2.total() {
p1
} else {
p2
}
} else {
p1
};
children.push(p);
}
SegmentSize {
seg_id,
method: WalNeeded,
this_size: seg.start_size,
children,
}
} else {
// If any of the direct children are "needed", need to be able to reconstruct here
let mut children_needed = false;
for &child in seg.children_after.iter() {
let seg = &self.segments[child];
if seg.needed {
children_needed = true;
break;
}
}
let method1 = if !children_needed {
let mut children = Vec::new();
for child in seg.children_after.iter() {
children.push(self.size_from_snapshot_later(*child));
}
Some(SegmentSize {
seg_id,
method: Skipped,
this_size: 0,
children,
})
} else {
None
};
// If this a junction, consider snapshotting here
let method2 = if children_needed || seg.children_after.len() >= 2 {
let mut children = Vec::new();
for child in seg.children_after.iter() {
children.push(self.size_from_wal(*child));
}
Some(SegmentSize {
seg_id,
method: SnapshotAfter,
this_size: seg.end_size,
children,
})
} else {
None
};
match (method1, method2) {
(None, None) => panic!(),
(Some(method), None) => method,
(None, Some(method)) => method,
(Some(method1), Some(method2)) => {
if method1.total() < method2.total() {
method1
} else {
method2
}
}
}
}
}
pub fn into_segments(self) -> Vec<Segment> {
self.segments
}
}

View File

@@ -1,268 +0,0 @@
//! Tenant size model testing ground.
//!
//! Has a number of scenarios and a `main` for invoking these by number, calculating the history
//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios
//! into pngs.
use tenant_size_model::{Segment, SegmentSize, Storage};
// Main branch only. Some updates on it.
fn scenario_1() -> (Vec<Segment>, SegmentSize) {
// Create main branch
let mut storage = Storage::new("main");
// Bulk load 5 GB of data to it
storage.insert("main", 5_000);
// Stream of updates
for _ in 0..5 {
storage.update("main", 1_000);
}
let size = storage.calculate(1000);
(storage.into_segments(), size)
}
// Main branch only. Some updates on it.
fn scenario_2() -> (Vec<Segment>, SegmentSize) {
// Create main branch
let mut storage = Storage::new("main");
// Bulk load 5 GB of data to it
storage.insert("main", 5_000);
// Stream of updates
for _ in 0..5 {
storage.update("main", 1_000);
}
// Branch
storage.branch("main", "child");
storage.update("child", 1_000);
// More updates on parent
storage.update("main", 1_000);
let size = storage.calculate(1000);
(storage.into_segments(), size)
}
// Like 2, but more updates on main
fn scenario_3() -> (Vec<Segment>, SegmentSize) {
// Create main branch
let mut storage = Storage::new("main");
// Bulk load 5 GB of data to it
storage.insert("main", 5_000);
// Stream of updates
for _ in 0..5 {
storage.update("main", 1_000);
}
// Branch
storage.branch("main", "child");
storage.update("child", 1_000);
// More updates on parent
for _ in 0..5 {
storage.update("main", 1_000);
}
let size = storage.calculate(1000);
(storage.into_segments(), size)
}
// Diverged branches
fn scenario_4() -> (Vec<Segment>, SegmentSize) {
// Create main branch
let mut storage = Storage::new("main");
// Bulk load 5 GB of data to it
storage.insert("main", 5_000);
// Stream of updates
for _ in 0..5 {
storage.update("main", 1_000);
}
// Branch
storage.branch("main", "child");
storage.update("child", 1_000);
// More updates on parent
for _ in 0..8 {
storage.update("main", 1_000);
}
let size = storage.calculate(1000);
(storage.into_segments(), size)
}
fn scenario_5() -> (Vec<Segment>, SegmentSize) {
let mut storage = Storage::new("a");
storage.insert("a", 5000);
storage.branch("a", "b");
storage.update("b", 4000);
storage.update("a", 2000);
storage.branch("a", "c");
storage.insert("c", 4000);
storage.insert("a", 2000);
let size = storage.calculate(5000);
(storage.into_segments(), size)
}
fn scenario_6() -> (Vec<Segment>, SegmentSize) {
use std::borrow::Cow;
const NO_OP: Cow<'static, str> = Cow::Borrowed("");
let branches = [
Some(0x7ff1edab8182025f15ae33482edb590a_u128),
Some(0xb1719e044db05401a05a2ed588a3ad3f),
Some(0xb68d6691c895ad0a70809470020929ef),
];
// compared to other scenarios, this one uses bytes instead of kB
let mut storage = Storage::new(None);
storage.branch(&None, branches[0]); // at 0
storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064
storage.branch(&branches[0], branches[1]); // at 108951064
storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472
storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424
storage.branch(&branches[0], branches[2]); // at 283415424
storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616
storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400
let size = storage.calculate(100_000);
(storage.into_segments(), size)
}
fn main() {
let args: Vec<String> = std::env::args().collect();
let scenario = if args.len() < 2 { "1" } else { &args[1] };
let (segments, size) = match scenario {
"1" => scenario_1(),
"2" => scenario_2(),
"3" => scenario_3(),
"4" => scenario_4(),
"5" => scenario_5(),
"6" => scenario_6(),
other => {
eprintln!("invalid scenario {}", other);
std::process::exit(1);
}
};
graphviz_tree(&segments, &size);
}
fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
use tenant_size_model::SegmentMethod::*;
let seg_id = node.seg_id;
let seg = segments.get(seg_id).unwrap();
let lsn = seg.end_lsn;
let size = seg.end_size;
let method = node.method;
println!(" {{");
println!(" node [width=0.1 height=0.1 shape=oval]");
let tenant_size = node.total_children();
let penwidth = if seg.needed { 6 } else { 3 };
let x = match method {
SnapshotAfter =>
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"),
Wal =>
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
WalNeeded =>
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
Skipped =>
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"),
};
println!(" \"seg{seg_id}\" [{x}]");
println!(" }}");
// Recurse. Much of the data is actually on the edge
for child in node.children.iter() {
let child_id = child.seg_id;
graphviz_recurse(segments, child);
let edge_color = match child.method {
SnapshotAfter => "gray",
Wal => "black",
WalNeeded => "black",
Skipped => "gray",
};
println!(" {{");
println!(" edge [] ");
print!(" \"seg{seg_id}\" -> \"seg{child_id}\" [");
print!("color={edge_color}");
if child.method == WalNeeded {
print!(" penwidth=6");
}
if child.method == Wal {
print!(" penwidth=3");
}
let next = segments.get(child_id).unwrap();
if next.op.is_empty() {
print!(
" label=\"{} / {}\"",
next.end_lsn - seg.end_lsn,
(next.end_size as i128 - seg.end_size as i128)
);
} else {
print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);
}
println!("]");
println!(" }}");
}
}
fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
println!("digraph G {{");
println!(" fontname=\"Helvetica,Arial,sans-serif\"");
println!(" node [fontname=\"Helvetica,Arial,sans-serif\"]");
println!(" edge [fontname=\"Helvetica,Arial,sans-serif\"]");
println!(" graph [center=1 rankdir=LR]");
println!(" edge [dir=none]");
graphviz_recurse(segments, tree);
println!("}}");
}
#[test]
fn scenarios_return_same_size() {
type ScenarioFn = fn() -> (Vec<Segment>, SegmentSize);
let truths: &[(u32, ScenarioFn, _)] = &[
(line!(), scenario_1, 8000),
(line!(), scenario_2, 9000),
(line!(), scenario_3, 13000),
(line!(), scenario_4, 16000),
(line!(), scenario_5, 17000),
(line!(), scenario_6, 333_792_000),
];
for (line, scenario, expected) in truths {
let (_, size) = scenario();
assert_eq!(*expected, size.total_children(), "scenario on line {line}");
}
}

View File

@@ -9,6 +9,9 @@ anyhow = "1.0"
bincode = "1.3"
bytes = "1.0.1"
hyper = { version = "0.14.7", features = ["full"] }
pin-project-lite = "0.2.7"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
routerify = "3"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
@@ -16,7 +19,7 @@ thiserror = "1.0"
tokio = { version = "1.17", features = ["macros"]}
tokio-rustls = "0.23"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
nix = "0.25"
signal-hook = "0.3.10"
rand = "0.8.3"
@@ -27,11 +30,9 @@ rustls-split = "0.3.0"
git-version = "0.3.5"
serde_with = "2.0"
once_cell = "1.13.0"
strum = "0.24"
strum_macros = "0.24"
metrics = { path = "../metrics" }
pq_proto = { path = "../pq_proto" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[dev-dependencies]

View File

@@ -0,0 +1,52 @@
use postgres::Config;
pub fn connection_host_port(config: &Config) -> (String, u16) {
assert_eq!(
config.get_hosts().len(),
1,
"only one pair of host and port is supported in connection string"
);
assert_eq!(
config.get_ports().len(),
1,
"only one pair of host and port is supported in connection string"
);
let host = match &config.get_hosts()[0] {
postgres::config::Host::Tcp(host) => host.as_ref(),
postgres::config::Host::Unix(host) => host.to_str().unwrap(),
};
(host.to_owned(), config.get_ports()[0])
}
pub fn connection_address(config: &Config) -> String {
let (host, port) = connection_host_port(config);
format!("{}:{}", host, port)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_connection_host_port() {
let config: Config = "postgresql://no_user@localhost:64000/no_db"
.parse()
.unwrap();
assert_eq!(
connection_host_port(&config),
("localhost".to_owned(), 64000)
);
}
#[test]
#[should_panic(expected = "only one pair of host and port is supported in connection string")]
fn test_connection_host_port_multiple_ports() {
let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db"
.parse()
.unwrap();
assert_eq!(
connection_host_port(&config),
("localhost".to_owned(), 64000)
);
}
}

View File

@@ -12,8 +12,16 @@ pub fn create_dir(path: impl AsRef<Path>) -> io::Result<()> {
let path = path.as_ref();
fs::create_dir(path)?;
fsync_file_and_parent(path)?;
Ok(())
File::open(path)?.sync_all()?;
if let Some(parent) = path.parent() {
File::open(parent)?.sync_all()
} else {
Err(io::Error::new(
io::ErrorKind::InvalidInput,
"can't find parent",
))
}
}
/// Similar to [`std::fs::create_dir_all`], except we fsync all
@@ -57,12 +65,12 @@ pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
// Fsync the created directories from child to parent.
for &path in dirs_to_create.iter() {
fsync(path)?;
File::open(path)?.sync_all()?;
}
// If we created any new directories, fsync the parent.
if !dirs_to_create.is_empty() {
fsync(path)?;
File::open(path)?.sync_all()?;
}
Ok(())
@@ -84,33 +92,6 @@ pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str)
.with_extension(new_extension.as_ref())
}
pub fn fsync_file_and_parent(file_path: &Path) -> io::Result<()> {
let parent = file_path.parent().ok_or_else(|| {
io::Error::new(
io::ErrorKind::Other,
format!("File {file_path:?} has no parent"),
)
})?;
fsync(file_path)?;
fsync(parent)?;
Ok(())
}
pub fn fsync(path: &Path) -> io::Result<()> {
File::open(path)
.map_err(|e| io::Error::new(e.kind(), format!("Failed to open the file {path:?}: {e}")))
.and_then(|file| {
file.sync_all().map_err(|e| {
io::Error::new(
e.kind(),
format!("Failed to sync file {path:?} data and metadata: {e}"),
)
})
})
.map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
}
#[cfg(test)]
mod tests {
use tempfile::tempdir;

View File

@@ -75,12 +75,6 @@ impl From<[u8; 16]> for Id {
}
}
impl From<Id> for u128 {
fn from(id: Id) -> Self {
u128::from_le_bytes(id.0)
}
}
impl fmt::Display for Id {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(&self.hex_encode())
@@ -142,12 +136,6 @@ macro_rules! id_newtype {
}
}
impl From<$t> for u128 {
fn from(id: $t) -> Self {
u128::from(id.0)
}
}
impl fmt::Display for $t {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.0.fmt(f)
@@ -204,17 +192,6 @@ pub struct TenantId(Id);
id_newtype!(TenantId);
/// Neon Connection Id identifies long-lived connections (for example a pagestream
/// connection with the page_service). Is used for better logging and tracing
///
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
/// See [`Id`] for alternative ways to serialize it.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
pub struct ConnectionId(Id);
id_newtype!(ConnectionId);
// A pair uniquely identifying Neon instance.
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TenantTimelineId {

View File

@@ -1,6 +1,8 @@
//! `utils` is intended to be a place to put code that is shared
//! between other crates in this repository.
#![allow(clippy::manual_range_contains)]
/// `Lsn` type implements common tasks on Log Sequence Numbers
pub mod lsn;
/// SeqWait allows waiting for a future sequence number to arrive
@@ -15,9 +17,13 @@ pub mod vec_map;
pub mod bin_ser;
pub mod postgres_backend;
pub mod postgres_backend_async;
pub mod pq_proto;
// helper functions for creating and fsyncing
pub mod crashsafe;
// dealing with connstring parsing and handy access to it's parts
pub mod connstring;
// helper functions for creating and fsyncing directories/trees
pub mod crashsafe_dir;
// common authentication routines
pub mod auth;
@@ -33,12 +39,13 @@ pub mod sock_split;
// common log initialisation routine
pub mod logging;
pub mod lock_file;
// Misc
pub mod accum;
pub mod shutdown;
// Tools for calling certain async methods in sync contexts
pub mod sync;
// Utility for binding TcpListeners with proper socket options.
pub mod tcp_listener;

View File

@@ -1,81 +0,0 @@
//! A module to create and read lock files. A lock file ensures that only one
//! process is running at a time, in a particular directory.
//!
//! File locking is done using [`fcntl::flock`], which means that holding the
//! lock on file only prevents acquiring another lock on it; all other
//! operations are still possible on files. Other process can still open, read,
//! write, or remove the file, for example.
//! If the file is removed while a process is holding a lock on it,
//! the process that holds the lock does not get any error or notification.
//! Furthermore, you can create a new file with the same name and lock the new file,
//! while the old process is still running.
//! Deleting the lock file while the locking process is still running is a bad idea!
use std::{fs, os::unix::prelude::AsRawFd, path::Path};
use anyhow::Context;
use nix::fcntl;
use crate::crashsafe;
pub enum LockCreationResult {
Created {
new_lock_contents: String,
file: fs::File,
},
AlreadyLocked {
existing_lock_contents: String,
},
CreationFailed(anyhow::Error),
}
/// Creates a lock file in the path given and writes the given contents into the file.
/// Note: The lock is automatically released when the file closed. You might want to use Box::leak to make sure it lives until the end of the program.
pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
let lock_file = match fs::OpenOptions::new()
.create(true) // O_CREAT
.write(true)
.open(lock_file_path)
.context("Failed to open lock file")
{
Ok(file) => file,
Err(e) => return LockCreationResult::CreationFailed(e),
};
match fcntl::flock(
lock_file.as_raw_fd(),
fcntl::FlockArg::LockExclusiveNonblock,
) {
Ok(()) => {
match lock_file
.set_len(0)
.context("Failed to truncate lockfile")
.and_then(|()| {
fs::write(lock_file_path, &contents).with_context(|| {
format!("Failed to write '{contents}' contents into lockfile")
})
})
.and_then(|()| {
crashsafe::fsync_file_and_parent(lock_file_path)
.context("Failed to fsync lockfile")
}) {
Ok(()) => LockCreationResult::Created {
new_lock_contents: contents,
file: lock_file,
},
Err(e) => LockCreationResult::CreationFailed(e),
}
}
Err(nix::errno::Errno::EAGAIN) => {
match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
existing_lock_contents,
},
Err(e) => LockCreationResult::CreationFailed(e),
}
}
Err(e) => {
LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
}
}
}

View File

@@ -1,28 +1,19 @@
use std::str::FromStr;
use std::{
fs::{File, OpenOptions},
path::Path,
};
use anyhow::Context;
use strum_macros::{EnumString, EnumVariantNames};
use anyhow::{Context, Result};
#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
#[strum(serialize_all = "snake_case")]
pub enum LogFormat {
Plain,
Json,
}
pub fn init(log_filename: impl AsRef<Path>, daemonize: bool) -> Result<File> {
// Don't open the same file for output multiple times;
// the different fds could overwrite each other's output.
let log_file = OpenOptions::new()
.create(true)
.append(true)
.open(&log_filename)
.with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?;
impl LogFormat {
pub fn from_config(s: &str) -> anyhow::Result<LogFormat> {
use strum::VariantNames;
LogFormat::from_str(s).with_context(|| {
format!(
"Unrecognized log format. Please specify one of: {:?}",
LogFormat::VARIANTS
)
})
}
}
pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
let default_filter_str = "info";
// We fall back to printing all spans at info-level or above if
@@ -32,14 +23,20 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
let base_logger = tracing_subscriber::fmt()
.with_env_filter(env_filter)
.with_target(false)
.with_ansi(false)
.with_writer(std::io::stdout);
.with_target(false) // don't include event targets
.with_ansi(false); // don't use colors in log file;
match log_format {
LogFormat::Json => base_logger.json().init(),
LogFormat::Plain => base_logger.init(),
// we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it
// if we do not use daemonization (e.g. in docker) it is better to log to stdout directly
// for example to be in line with docker log command which expects logs comimg from stdout
if daemonize {
let x = log_file.try_clone().unwrap();
base_logger
.with_writer(move || x.try_clone().unwrap())
.init();
} else {
base_logger.init();
}
Ok(())
Ok(log_file)
}

View File

@@ -13,7 +13,7 @@ use crate::seqwait::MonotonicCounter;
pub const XLOG_BLCKSZ: u32 = 8192;
/// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Lsn(pub u64);
@@ -138,7 +138,7 @@ impl FromStr for Lsn {
///
/// If the input string is missing the '/' character, then use `Lsn::from_hex`
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut splitter = s.trim().split('/');
let mut splitter = s.split('/');
if let (Some(left), Some(right), None) = (splitter.next(), splitter.next(), splitter.next())
{
let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
@@ -270,11 +270,6 @@ mod tests {
);
assert_eq!(Lsn::from_hex("0"), Ok(Lsn(0)));
assert_eq!(Lsn::from_hex("F12345678AAAA5555"), Err(LsnParseError));
let expected_lsn = Lsn(0x3C490F8);
assert_eq!(" 0/3C490F8".parse(), Ok(expected_lsn));
assert_eq!("0/3C490F8 ".parse(), Ok(expected_lsn));
assert_eq!(" 0/3C490F8 ".parse(), Ok(expected_lsn));
}
#[test]

View File

@@ -3,10 +3,10 @@
//! implementation determining how to process the queries. Currently its API
//! is rather narrow, but we can extend it once required.
use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
use crate::sock_split::{BidiStream, ReadStream, WriteStream};
use anyhow::{bail, ensure, Context, Result};
use bytes::{Bytes, BytesMut};
use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
use rand::Rng;
use serde::{Deserialize, Serialize};
use std::fmt;

View File

@@ -4,9 +4,9 @@
//! is rather narrow, but we can extend it once required.
use crate::postgres_backend::AuthType;
use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
use anyhow::{bail, Context, Result};
use bytes::{Bytes, BytesMut};
use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
use rand::Rng;
use std::future::Future;
use std::net::SocketAddr;

View File

@@ -2,9 +2,7 @@
//! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
//! on message formats.
// Tools for calling certain async methods in sync contexts.
pub mod sync;
use crate::sync::{AsyncishRead, SyncFuture};
use anyhow::{bail, ensure, Context, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use postgres_protocol::PG_EPOCH;
@@ -18,7 +16,6 @@ use std::{
str,
time::{Duration, SystemTime},
};
use sync::{AsyncishRead, SyncFuture};
use tokio::io::AsyncReadExt;
use tracing::{trace, warn};
@@ -201,7 +198,7 @@ impl FeMessage {
///
/// ```
/// # use std::io;
/// # use pq_proto::FeMessage;
/// # use utils::pq_proto::FeMessage;
/// #
/// # fn process_message(msg: FeMessage) -> anyhow::Result<()> {
/// # Ok(())
@@ -305,7 +302,6 @@ impl FeStartupPacket {
Err(e) => return Err(e.into()),
};
#[allow(clippy::manual_range_contains)]
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
bail!("invalid message length");
}

View File

@@ -29,7 +29,7 @@ impl<S, T: Future> SyncFuture<S, T> {
/// Example:
///
/// ```
/// # use pq_proto::sync::SyncFuture;
/// # use utils::sync::SyncFuture;
/// # use std::future::Future;
/// # use tokio::io::AsyncReadExt;
/// #

View File

@@ -12,61 +12,62 @@ testing = ["fail/failpoints"]
profiling = ["pprof"]
[dependencies]
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
anyhow = { version = "1.0", features = ["backtrace"] }
async-stream = "0.3"
async-trait = "0.1"
byteorder = "1.4.3"
bytes = "1.0.1"
chrono = "0.4.19"
clap = { version = "4.0", features = ["string"] }
close_fds = "0.3.2"
const_format = "0.2.21"
crc32c = "0.6.0"
crossbeam-utils = "0.8.5"
fail = "0.5.0"
futures = "0.3.13"
git-version = "0.3.5"
hex = "0.4.3"
humantime = "2.1.0"
humantime-serde = "1.1.1"
hyper = "0.14"
itertools = "0.10.3"
nix = "0.25"
num-traits = "0.2.15"
once_cell = "1.13.0"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
rand = "0.8.3"
regex = "1.4.5"
rstar = "0.9.3"
scopeguard = "1.1.0"
bytes = "1.0.1"
byteorder = "1.4.3"
futures = "0.3.13"
hex = "0.4.3"
hyper = "0.14"
itertools = "0.10.3"
clap = { version = "4.0", features = ["string"] }
daemonize = "0.4.1"
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
anyhow = { version = "1.0", features = ["backtrace"] }
crc32c = "0.6.0"
thiserror = "1.0"
tar = "0.4.33"
humantime = "2.1.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "2.0"
signal-hook = "0.3.10"
svg_fmt = "0.4.1"
tar = "0.4.33"
thiserror = "1.0"
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
toml_edit = { version = "0.14", features = ["easy"] }
tracing = "0.1.36"
url = "2"
walkdir = "2.3.2"
humantime-serde = "1.1.1"
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
toml_edit = { version = "0.14", features = ["easy"] }
scopeguard = "1.1.0"
const_format = "0.2.21"
tracing = "0.1.36"
signal-hook = "0.3.10"
url = "2"
nix = "0.25"
once_cell = "1.13.0"
crossbeam-utils = "0.8.5"
fail = "0.5.0"
git-version = "0.3.5"
rstar = "0.9.3"
num-traits = "0.2.15"
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
etcd_broker = { path = "../libs/etcd_broker" }
metrics = { path = "../libs/metrics" }
pageserver_api = { path = "../libs/pageserver_api" }
postgres_ffi = { path = "../libs/postgres_ffi" }
pq_proto = { path = "../libs/pq_proto" }
remote_storage = { path = "../libs/remote_storage" }
tenant_size_model = { path = "../libs/tenant_size_model" }
etcd_broker = { path = "../libs/etcd_broker" }
metrics = { path = "../libs/metrics" }
utils = { path = "../libs/utils" }
remote_storage = { path = "../libs/remote_storage" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
close_fds = "0.3.2"
walkdir = "2.3.2"
dashmap = "5.4.0"
[dev-dependencies]
criterion = "0.4"
@@ -76,7 +77,3 @@ tempfile = "3.2"
[[bench]]
name = "bench_layer_map"
harness = false
[[bench]]
name = "bench_walredo"
harness = false

File diff suppressed because one or more lines are too long

View File

@@ -22,8 +22,8 @@ use std::time::SystemTime;
use tar::{Builder, EntryType, Header};
use tracing::*;
use crate::reltag::{RelTag, SlruKind};
use crate::tenant::Timeline;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};

View File

@@ -1,150 +0,0 @@
//! A tool for visualizing the arrangement of layerfiles within a timeline.
//!
//! It reads filenames from stdin and prints a svg on stdout. The image is a plot in
//! page-lsn space, where every delta layer is a rectangle and every image layer is a
//! thick line. Legend:
//! - The x axis (left to right) represents page index.
//! - The y axis represents LSN, growing upwards.
//!
//! Coordinates in both axis are compressed for better readability.
//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
//!
//! Example use:
//! ```
//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE
//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
//! $ firefox out.svg
//! ```
//!
//! This API was chosen so that we can easily work with filenames extracted from ssh,
//! or from pageserver log files.
//!
//! TODO Consider shipping this as a grafana panel plugin:
//! https://grafana.com/tutorials/build-a-panel-plugin/
use anyhow::Result;
use pageserver::repository::Key;
use std::cmp::Ordering;
use std::io::{self, BufRead};
use std::{
collections::{BTreeMap, BTreeSet},
ops::Range,
};
use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke};
use utils::{lsn::Lsn, project_git_version};
project_git_version!(GIT_VERSION);
// Map values to their compressed coordinate - the index the value
// would have in a sorted and deduplicated list of all values.
fn build_coordinate_compression_map<T: Ord + Copy>(coords: Vec<T>) -> BTreeMap<T, usize> {
let set: BTreeSet<T> = coords.into_iter().collect();
let mut map: BTreeMap<T, usize> = BTreeMap::new();
for (i, e) in set.iter().enumerate() {
map.insert(*e, i);
}
map
}
fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
let split: Vec<&str> = name.split("__").collect();
let keys: Vec<&str> = split[0].split('-').collect();
let mut lsns: Vec<&str> = split[1].split('-').collect();
if lsns.len() == 1 {
lsns.push(lsns[0]);
}
let keys = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
let lsns = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap();
(keys, lsns)
}
fn main() -> Result<()> {
// Parse layer filenames from stdin
let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
let stdin = io::stdin();
for line in stdin.lock().lines() {
let range = parse_filename(&line.unwrap());
ranges.push(range);
}
// Collect all coordinates
let mut keys: Vec<Key> = vec![];
let mut lsns: Vec<Lsn> = vec![];
for (keyr, lsnr) in &ranges {
keys.push(keyr.start);
keys.push(keyr.end);
lsns.push(lsnr.start);
lsns.push(lsnr.end);
}
// Analyze
let key_map = build_coordinate_compression_map(keys);
let lsn_map = build_coordinate_compression_map(lsns);
// Initialize stats
let mut num_deltas = 0;
let mut num_images = 0;
// Draw
let stretch = 3.0; // Stretch out vertically for better visibility
println!(
"{}",
BeginSvg {
w: key_map.len() as f32,
h: stretch * lsn_map.len() as f32
}
);
for (keyr, lsnr) in &ranges {
let key_start = *key_map.get(&keyr.start).unwrap();
let key_end = *key_map.get(&keyr.end).unwrap();
let key_diff = key_end - key_start;
let lsn_max = lsn_map.len();
if key_start >= key_end {
panic!("Invalid key range {}-{}", key_start, key_end);
}
let lsn_start = *lsn_map.get(&lsnr.start).unwrap();
let lsn_end = *lsn_map.get(&lsnr.end).unwrap();
let mut lsn_diff = (lsn_end - lsn_start) as f32;
let mut fill = Fill::None;
let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
let mut lsn_offset = 0.0;
// Fill in and thicken rectangle if it's an
// image layer so that we can see it.
match lsn_start.cmp(&lsn_end) {
Ordering::Less => num_deltas += 1,
Ordering::Equal => {
num_images += 1;
lsn_diff = 0.3;
lsn_offset = -lsn_diff / 2.0;
margin = 0.05;
fill = Fill::Color(rgb(0, 0, 0));
}
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
}
println!(
" {}",
rectangle(
key_start as f32 + stretch * margin,
stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)),
key_diff as f32 - stretch * 2.0 * margin,
stretch * (lsn_diff - 2.0 * margin)
)
.fill(fill)
.stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
.border_radius(0.4)
);
}
println!("{}", EndSvg);
eprintln!("num_images: {}", num_images);
eprintln!("num_deltas: {}", num_deltas);
Ok(())
}

View File

@@ -1,14 +1,17 @@
//! Main entry point for the Page Server executable.
use remote_storage::GenericRemoteStorage;
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use nix::unistd::Pid;
use tracing::*;
use anyhow::{anyhow, bail, Context, Result};
use clap::{Arg, ArgAction, Command};
use daemonize::Daemonize;
use fail::FailScenario;
use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
http, page_cache, page_service, profiling, task_mgr,
@@ -16,22 +19,20 @@ use pageserver::{
task_mgr::{
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
},
tenant_mgr, virtual_file,
tenant_mgr, virtual_file, LOG_FILE_NAME,
};
use remote_storage::GenericRemoteStorage;
use utils::{
auth::JwtAuth,
lock_file, logging,
logging,
postgres_backend::AuthType,
project_git_version,
shutdown::exit_now,
signals::{self, Signal},
tcp_listener,
};
project_git_version!(GIT_VERSION);
const PID_FILE_NAME: &str = "pageserver.pid";
const FEATURES: &[&str] = &[
#[cfg(feature = "testing")]
"testing",
@@ -64,7 +65,6 @@ fn main() -> anyhow::Result<()> {
let workdir = workdir
.canonicalize()
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
let cfg_file_path = workdir.join("pageserver.toml");
// Set CWD to workdir for non-daemon modes
@@ -75,6 +75,8 @@ fn main() -> anyhow::Result<()> {
)
})?;
let daemonize = arg_matches.get_flag("daemonize");
let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
ControlFlow::Continue(conf) => conf,
ControlFlow::Break(()) => {
@@ -85,7 +87,7 @@ fn main() -> anyhow::Result<()> {
let tenants_path = conf.tenants_path();
if !tenants_path.exists() {
utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| {
utils::crashsafe_dir::create_dir_all(conf.tenants_path()).with_context(|| {
format!(
"Failed to create tenants root dir at '{}'",
tenants_path.display()
@@ -100,7 +102,7 @@ fn main() -> anyhow::Result<()> {
virtual_file::init(conf.max_file_descriptors);
page_cache::init(conf.page_cache_size);
start_pageserver(conf).context("Failed to start pageserver")?;
start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
scenario.teardown();
Ok(())
@@ -195,33 +197,11 @@ fn initialize_config(
})
}
fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
logging::init(conf.log_format)?;
info!("version: {}", version());
fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
// Initialize logger
let log_file = logging::init(LOG_FILE_NAME, daemonize)?;
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
lock_file::LockCreationResult::Created {
new_lock_contents,
file,
} => {
info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
file
}
lock_file::LockCreationResult::AlreadyLocked {
existing_lock_contents,
} => anyhow::bail!(
"Could not lock pid file; pageserver is already running in {:?} with PID {}",
conf.workdir,
existing_lock_contents
),
lock_file::LockCreationResult::CreationFailed(e) => {
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
}
};
// ensure that the lock file is held even if the main thread of the process is panics
// we need to release the lock file only when the current process is gone
let _ = Box::leak(Box::new(lock_file));
info!("version: {}", version());
// TODO: Check that it looks like a valid repository before going further
@@ -238,6 +218,33 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
);
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
// NB: Don't spawn any threads before daemonizing!
if daemonize {
info!("daemonizing...");
// There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
// that we will see any accidental manual fprintf's or backtraces.
let stdout = log_file
.try_clone()
.with_context(|| format!("Failed to clone log file '{:?}'", log_file))?;
let stderr = log_file;
let daemonize = Daemonize::new()
.pid_file("pageserver.pid")
.working_directory(".")
.stdout(stdout)
.stderr(stderr);
// XXX: The parent process should exit abruptly right after
// it has spawned a child to prevent coverage machinery from
// dumping stats into a `profraw` file now owned by the child.
// Otherwise, the coverage data will be damaged.
match daemonize.exit_action(|| exit_now(0)).start() {
Ok(_) => info!("Success, daemonized"),
Err(err) => bail!("{err}. could not daemonize. bailing."),
}
}
let signals = signals::install_shutdown_handlers()?;
// start profiler (if enabled)
@@ -340,6 +347,14 @@ fn cli() -> Command {
Command::new("Neon page server")
.about("Materializes WAL stream to pages and serves them to the postgres")
.version(version())
.arg(
Arg::new("daemonize")
.short('d')
.long("daemonize")
.action(ArgAction::SetTrue)
.help("Run in the background"),
)
.arg(
Arg::new("init")
.long("init")

View File

@@ -7,10 +7,7 @@
use anyhow::{anyhow, bail, ensure, Context, Result};
use remote_storage::RemoteStorageConfig;
use std::env;
use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
use std::num::NonZeroUsize;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::time::Duration;
@@ -19,7 +16,6 @@ use toml_edit::{Document, Item};
use url::Url;
use utils::{
id::{NodeId, TenantId, TimelineId},
logging::LogFormat,
postgres_backend::AuthType,
};
@@ -28,7 +24,6 @@ use crate::tenant_config::{TenantConf, TenantConfOpt};
/// The name of the metadata file pageserver creates per timeline.
pub const METADATA_FILE_NAME: &str = "metadata";
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
const TENANT_CONFIG_NAME: &str = "config";
pub mod defaults {
@@ -48,11 +43,6 @@ pub mod defaults {
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
pub const DEFAULT_LOG_FORMAT: &str = "plain";
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
///
/// Default built-in configuration file.
///
@@ -71,10 +61,6 @@ pub mod defaults {
# initial superuser role name to use when creating a new tenant
#initial_superuser_name = '{DEFAULT_SUPERUSER}'
#log_format = '{DEFAULT_LOG_FORMAT}'
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
# [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -138,11 +124,6 @@ pub struct PageServerConf {
/// Etcd broker endpoints to connect to.
pub broker_endpoints: Vec<Url>,
pub log_format: LogFormat,
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
}
#[derive(Debug, Clone, PartialEq, Eq)]
@@ -209,10 +190,6 @@ struct PageServerConfigBuilder {
profiling: BuilderValue<ProfilingConfig>,
broker_etcd_prefix: BuilderValue<String>,
broker_endpoints: BuilderValue<Vec<Url>>,
log_format: BuilderValue<LogFormat>,
concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
}
impl Default for PageServerConfigBuilder {
@@ -240,9 +217,6 @@ impl Default for PageServerConfigBuilder {
profiling: Set(ProfilingConfig::Disabled),
broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
broker_endpoints: Set(Vec::new()),
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
}
}
}
@@ -315,14 +289,6 @@ impl PageServerConfigBuilder {
self.profiling = BuilderValue::Set(profiling)
}
pub fn log_format(&mut self, log_format: LogFormat) {
self.log_format = BuilderValue::Set(log_format)
}
pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) {
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let broker_endpoints = self
.broker_endpoints
@@ -367,12 +333,6 @@ impl PageServerConfigBuilder {
broker_etcd_prefix: self
.broker_etcd_prefix
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
concurrent_tenant_size_logical_size_queries: self
.concurrent_tenant_size_logical_size_queries
.ok_or(anyhow!(
"missing concurrent_tenant_size_logical_size_queries"
))?,
})
}
}
@@ -404,33 +364,6 @@ impl PageServerConf {
self.timelines_path(tenant_id).join(timeline_id.to_string())
}
pub fn timeline_uninit_mark_file_path(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> PathBuf {
path_with_suffix_extension(
self.timeline_path(&timeline_id, &tenant_id),
TIMELINE_UNINIT_MARK_SUFFIX,
)
}
pub fn traces_path(&self) -> PathBuf {
self.workdir.join("traces")
}
pub fn trace_path(
&self,
tenant_id: &TenantId,
timeline_id: &TimelineId,
connection_id: &ConnectionId,
) -> PathBuf {
self.traces_path()
.join(tenant_id.to_string())
.join(timeline_id.to_string())
.join(connection_id.to_string())
}
/// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located.
pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
@@ -441,28 +374,28 @@ impl PageServerConf {
//
// Postgres distribution paths
//
pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf {
let path = self.pg_distrib_dir.clone();
match pg_version {
14 => Ok(path.join(format!("v{pg_version}"))),
15 => Ok(path.join(format!("v{pg_version}"))),
_ => bail!("Unsupported postgres version: {}", pg_version),
14 => path.join(format!("v{pg_version}")),
15 => path.join(format!("v{pg_version}")),
_ => panic!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf {
match pg_version {
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
_ => bail!("Unsupported postgres version: {}", pg_version),
14 => self.pg_distrib_dir(pg_version).join("bin"),
15 => self.pg_distrib_dir(pg_version).join("bin"),
_ => panic!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf {
match pg_version {
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
_ => bail!("Unsupported postgres version: {}", pg_version),
14 => self.pg_distrib_dir(pg_version).join("lib"),
15 => self.pg_distrib_dir(pg_version).join("lib"),
_ => panic!("Unsupported postgres version: {}", pg_version),
}
}
@@ -513,15 +446,6 @@ impl PageServerConf {
})
.collect::<anyhow::Result<_>>()?,
),
"log_format" => builder.log_format(
LogFormat::from_config(&parse_toml_string(key, item)?)?
),
"concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
let input = parse_toml_string(key, item)?;
let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
ConfigurableSemaphore::new(permits)
}),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -614,9 +538,8 @@ impl PageServerConf {
PathBuf::from(format!("../tmp_check/test_{test_name}"))
}
#[cfg(test)]
pub fn dummy_conf(repo_dir: PathBuf) -> Self {
let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
PageServerConf {
id: NodeId(0),
wait_lsn_timeout: Duration::from_secs(60),
@@ -627,7 +550,7 @@ impl PageServerConf {
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
superuser: "cloud_admin".to_string(),
workdir: repo_dir,
pg_distrib_dir,
pg_distrib_dir: PathBuf::new(),
auth_type: AuthType::Trust,
auth_validation_public_key_path: None,
remote_storage_config: None,
@@ -635,8 +558,6 @@ impl PageServerConf {
default_tenant_conf: TenantConf::dummy_conf(),
broker_endpoints: Vec::new(),
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
}
}
}
@@ -702,58 +623,6 @@ fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
.collect()
}
/// Configurable semaphore permits setting.
///
/// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty
/// semaphore cannot be distinguished, leading any feature using these to await forever (or until
/// new permits are added).
#[derive(Debug, Clone)]
pub struct ConfigurableSemaphore {
initial_permits: NonZeroUsize,
inner: std::sync::Arc<tokio::sync::Semaphore>,
}
impl ConfigurableSemaphore {
pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) {
Some(x) => x,
None => panic!("const unwrap is not yet stable"),
};
/// Initializse using a non-zero amount of permits.
///
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
/// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
/// behave like [`futures::future::pending`], just waiting until new permits are added.
pub fn new(initial_permits: NonZeroUsize) -> Self {
ConfigurableSemaphore {
initial_permits,
inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())),
}
}
}
impl Default for ConfigurableSemaphore {
fn default() -> Self {
Self::new(Self::DEFAULT_INITIAL)
}
}
impl PartialEq for ConfigurableSemaphore {
fn eq(&self, other: &Self) -> bool {
// the number of permits can be increased at runtime, so we cannot really fulfill the
// PartialEq value equality otherwise
self.initial_permits == other.initial_permits
}
}
impl Eq for ConfigurableSemaphore {}
impl ConfigurableSemaphore {
pub fn inner(&self) -> &std::sync::Arc<tokio::sync::Semaphore> {
&self.inner
}
}
#[cfg(test)]
mod tests {
use std::{
@@ -783,8 +652,6 @@ max_file_descriptors = 333
initial_superuser_name = 'zzzz'
id = 10
log_format = 'json'
"#;
#[test]
@@ -824,8 +691,6 @@ log_format = 'json'
.parse()
.expect("Failed to parse a valid broker endpoint URL")],
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
},
"Correct defaults should be used when no config values are provided"
);
@@ -870,8 +735,6 @@ log_format = 'json'
.parse()
.expect("Failed to parse a valid broker endpoint URL")],
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::Json,
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -354,54 +354,6 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/size:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
get:
description: |
Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
responses:
"200":
description: OK,
content:
application/json:
schema:
type: object
required:
- id
- size
properties:
id:
type: string
format: hex
size:
type: integer
description: |
Size metric in bytes.
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/:
parameters:
- name: tenant_id
@@ -666,7 +618,6 @@ components:
- last_record_lsn
- disk_consistent_lsn
- awaits_download
- state
properties:
timeline_id:
type: string
@@ -709,8 +660,6 @@ components:
type: integer
awaits_download:
type: boolean
state:
type: string
# These 'local' and 'remote' fields just duplicate some of the fields
# above. They are kept for backwards-compatibility. They can be removed,

View File

@@ -129,7 +129,6 @@ async fn build_timeline_info(
}
};
let current_physical_size = Some(timeline.get_physical_size());
let state = timeline.current_state();
let info = TimelineInfo {
tenant_id: timeline.tenant_id,
@@ -159,7 +158,6 @@ async fn build_timeline_info(
remote_consistent_lsn,
awaits_download,
state,
// Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility
// with the control plane.
@@ -227,10 +225,13 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
let state = get_state(&request);
let timelines = info_span!("timeline_list", tenant = %tenant_id).in_scope(|| {
let timelines = tokio::task::spawn_blocking(move || {
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
Ok(tenant.list_timelines())
})?;
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
let mut response_data = Vec::with_capacity(timelines.len());
for timeline in timelines {
@@ -293,7 +294,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
let timeline_info = async {
let timeline = tokio::task::spawn_blocking(move || {
tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id, false)
tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id)
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
@@ -330,13 +331,14 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
let timeline = tenant_mgr::get_tenant(tenant_id, true)
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
.and_then(|tenant| tenant.get_timeline(timeline_id))
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
.map_err(ApiError::NotFound)?;
let result = match timeline
.find_lsn_for_timestamp(timestamp_pg)
.map_err(ApiError::InternalServerError)?
{
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
LsnForTimestamp::Present(lsn) => format!("{}", lsn),
LsnForTimestamp::Future(_lsn) => "future".into(),
LsnForTimestamp::Past(_lsn) => "past".into(),
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
@@ -520,7 +522,9 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
check_permission(&request, Some(tenant_id))?;
// if tenant is in progress of downloading it can be absent in global tenant map
let tenant = tenant_mgr::get_tenant(tenant_id, false);
let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false))
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
let state = get_state(&request);
let remote_index = &state.remote_index;
@@ -566,44 +570,6 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
)
}
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::InternalServerError)?;
// this can be long operation, it currently is not backed by any request coalescing or similar
let inputs = tenant
.gather_size_inputs()
.await
.map_err(ApiError::InternalServerError)?;
let size = inputs.calculate().map_err(ApiError::InternalServerError)?;
/// Private response type with the additional "unstable" `inputs` field.
///
/// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is
/// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`.
#[serde_with::serde_as]
#[derive(serde::Serialize)]
struct TenantHistorySize {
#[serde_as(as = "serde_with::DisplayFromStr")]
id: TenantId,
/// Size is a mixture of WAL and logical size, so the unit is bytes.
size: u64,
inputs: crate::tenant::size::ModelInputs,
}
json_response(
StatusCode::OK,
TenantHistorySize {
id: tenant_id,
size,
inputs,
},
)
}
// Helper function to standardize the error messages we produce on bad durations
//
// Intended to be used with anyhow's `with_context`, e.g.:
@@ -618,7 +584,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
check_permission(&request, None)?;
let request_data: TenantCreateRequest = json_request(&mut request).await?;
println!("tenant create: {:?}", request_data.trace_read_requests);
let remote_index = get_state(&request).remote_index.clone();
let mut tenant_conf = TenantConfOpt::default();
@@ -660,9 +625,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
}
if let Some(trace_read_requests) = request_data.trace_read_requests {
tenant_conf.trace_read_requests = Some(trace_read_requests);
}
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
@@ -750,9 +712,6 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
}
if let Some(trace_read_requests) = request_data.trace_read_requests {
tenant_conf.trace_read_requests = Some(trace_read_requests);
}
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
@@ -822,6 +781,11 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
}
// Run GC immediately on given timeline.
// FIXME: This is just for tests. See test_runner/regress/test_gc.py.
// This probably should require special authentication or a global flag to
// enable, I don't think we want to or need to allow regular clients to invoke
// GC.
// @hllinnaka in commits ec44f4b29, 3aca717f3
#[cfg(feature = "testing")]
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
@@ -829,17 +793,17 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
check_permission(&request, Some(tenant_id))?;
// FIXME: currently this will return a 500 error on bad tenant id; it should be 4XX
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
let repo = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
let _span_guard =
info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered();
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| repo.get_gc_horizon());
// Use tenant's pitr setting
let pitr = tenant.get_pitr_interval();
let result = tenant
let pitr = repo.get_pitr_interval();
let result = repo
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
.await
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
.map_err(ApiError::InternalServerError)?;
@@ -847,15 +811,19 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
}
// Run compaction immediately on given timeline.
// FIXME This is just for tests. Don't expect this to be exposed to
// the users or the api.
// @dhammika in commit a0781f229
#[cfg(feature = "testing")]
async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = repo
.get_timeline(timeline_id)
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
.map_err(ApiError::NotFound)?;
timeline.compact().map_err(ApiError::InternalServerError)?;
@@ -869,13 +837,13 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = repo
.get_timeline(timeline_id)
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
.map_err(ApiError::NotFound)?;
timeline
.checkpoint(CheckpointConfig::Forced)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
@@ -939,7 +907,6 @@ pub fn make_router(
.get("/v1/tenant", tenant_list_handler)
.post("/v1/tenant", tenant_create_handler)
.get("/v1/tenant/:tenant_id", tenant_status)
.get("/v1/tenant/:tenant_id/size", tenant_size_handler)
.put("/v1/tenant/config", tenant_config_handler)
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)

View File

@@ -12,10 +12,10 @@ use tracing::*;
use walkdir::WalkDir;
use crate::pgdatadir_mapping::*;
use crate::reltag::{RelTag, SlruKind};
use crate::tenant::Timeline;
use crate::walingest::WalIngest;
use crate::walrecord::DecodedWALRecord;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::*;
use postgres_ffi::waldecoder::WalStreamDecoder;
@@ -43,19 +43,19 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
/// The code that deals with the checkpoint would not work right if the
/// cluster was not shut down cleanly.
pub fn import_timeline_from_postgres_datadir(
path: &Path,
tline: &Timeline,
pgdata_path: &Path,
pgdata_lsn: Lsn,
lsn: Lsn,
) -> Result<()> {
let mut pg_control: Option<ControlFileData> = None;
// TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn)
// Then fishing out pg_control would be unnecessary
let mut modification = tline.begin_modification(pgdata_lsn);
let mut modification = tline.begin_modification(lsn);
modification.init_empty()?;
// Import all but pg_wal
let all_but_wal = WalkDir::new(pgdata_path)
let all_but_wal = WalkDir::new(path)
.into_iter()
.filter_entry(|entry| !entry.path().ends_with("pg_wal"));
for entry in all_but_wal {
@@ -63,7 +63,7 @@ pub fn import_timeline_from_postgres_datadir(
let metadata = entry.metadata().expect("error getting dir entry metadata");
if metadata.is_file() {
let absolute_path = entry.path();
let relative_path = absolute_path.strip_prefix(pgdata_path)?;
let relative_path = absolute_path.strip_prefix(path)?;
let file = File::open(absolute_path)?;
let len = metadata.len() as usize;
@@ -84,7 +84,7 @@ pub fn import_timeline_from_postgres_datadir(
"Postgres cluster was not shut down cleanly"
);
ensure!(
pg_control.checkPointCopy.redo == pgdata_lsn.0,
pg_control.checkPointCopy.redo == lsn.0,
"unexpected checkpoint REDO pointer"
);
@@ -92,10 +92,10 @@ pub fn import_timeline_from_postgres_datadir(
// this reads the checkpoint record itself, advancing the tip of the timeline to
// *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'.
import_wal(
&pgdata_path.join("pg_wal"),
&path.join("pg_wal"),
tline,
Lsn(pg_control.checkPointCopy.redo),
pgdata_lsn,
lsn,
)?;
Ok(())

View File

@@ -8,6 +8,7 @@ pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
pub mod profiling;
pub mod reltag;
pub mod repository;
pub mod storage_sync;
pub mod task_mgr;
@@ -15,7 +16,6 @@ pub mod tenant;
pub mod tenant_config;
pub mod tenant_mgr;
pub mod tenant_tasks;
pub mod trace;
pub mod virtual_file;
pub mod walingest;
pub mod walreceiver;
@@ -44,7 +44,7 @@ pub const DEFAULT_PG_VERSION: u32 = 14;
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
pub const LOG_FILE_NAME: &str = "pageserver.log";
/// Config for the Repository checkpointer
#[derive(Debug, Clone, Copy)]
@@ -80,6 +80,7 @@ pub async fn shutdown_pageserver(exit_code: i32) {
// There should be nothing left, but let's be sure
task_mgr::shutdown_tasks(None, None, None).await;
info!("Shut down successfully completed");
std::process::exit(exit_code);
}

View File

@@ -31,7 +31,6 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[
"compact",
"create images",
"init logical size",
"logical size",
"load layer map",
"gc",
];
@@ -366,7 +365,6 @@ pub struct TimelineMetrics {
pub compact_time_histo: Histogram,
pub create_images_time_histo: Histogram,
pub init_logical_size_histo: Histogram,
pub logical_size_histo: Histogram,
pub load_layer_map_histo: Histogram,
pub last_record_gauge: IntGauge,
pub wait_lsn_time_histo: Histogram,
@@ -399,9 +397,6 @@ impl TimelineMetrics {
let init_logical_size_histo = STORAGE_TIME
.get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id])
.unwrap();
let logical_size_histo = STORAGE_TIME
.get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id])
.unwrap();
let load_layer_map_histo = STORAGE_TIME
.get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id])
.unwrap();
@@ -433,7 +428,6 @@ impl TimelineMetrics {
compact_time_histo,
create_images_time_histo,
init_logical_size_histo,
logical_size_histo,
load_layer_map_histo,
last_record_gauge,
wait_lsn_time_histo,

View File

@@ -36,8 +36,9 @@
//! mapping is automatically removed and the slot is marked free.
//!
use dashmap::mapref::entry::Entry;
use dashmap::DashMap;
use std::{
collections::{hash_map::Entry, HashMap},
convert::TryInto,
sync::{
atomic::{AtomicU8, AtomicUsize, Ordering},
@@ -168,18 +169,11 @@ impl Slot {
pub struct PageCache {
/// This contains the mapping from the cache key to buffer slot that currently
/// contains the page, if any.
///
/// TODO: This is protected by a single lock. If that becomes a bottleneck,
/// this HashMap can be replaced with a more concurrent version, there are
/// plenty of such crates around.
///
/// If you add support for caching different kinds of objects, each object kind
/// can have a separate mapping map, next to this field.
materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
materialized_page_map: DashMap<MaterializedPageHashKey, Vec<Version>>,
ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
ephemeral_page_map: DashMap<(u64, u32), usize>,
immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,
immutable_page_map: DashMap<(u64, u32), usize>,
/// The actual buffers with their metadata.
slots: Box<[Slot]>,
@@ -616,7 +610,7 @@ impl PageCache {
fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
match cache_key {
CacheKey::MaterializedPage { hash_key, lsn } => {
let map = self.materialized_page_map.read().unwrap();
let map = &self.materialized_page_map;
let versions = map.get(hash_key)?;
let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
@@ -629,11 +623,11 @@ impl PageCache {
Some(version.slot_idx)
}
CacheKey::EphemeralPage { file_id, blkno } => {
let map = self.ephemeral_page_map.read().unwrap();
let map = &self.ephemeral_page_map;
Some(*map.get(&(*file_id, *blkno))?)
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let map = self.immutable_page_map.read().unwrap();
let map = &self.immutable_page_map;
Some(*map.get(&(*file_id, *blkno))?)
}
}
@@ -646,7 +640,7 @@ impl PageCache {
fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
match key {
CacheKey::MaterializedPage { hash_key, lsn } => {
let map = self.materialized_page_map.read().unwrap();
let map = &self.materialized_page_map;
let versions = map.get(hash_key)?;
if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
@@ -656,11 +650,11 @@ impl PageCache {
}
}
CacheKey::EphemeralPage { file_id, blkno } => {
let map = self.ephemeral_page_map.read().unwrap();
let map = &self.ephemeral_page_map;
Some(*map.get(&(*file_id, *blkno))?)
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let map = self.immutable_page_map.read().unwrap();
let map = &self.immutable_page_map;
Some(*map.get(&(*file_id, *blkno))?)
}
}
@@ -675,7 +669,7 @@ impl PageCache {
hash_key: old_hash_key,
lsn: old_lsn,
} => {
let mut map = self.materialized_page_map.write().unwrap();
let map = &self.materialized_page_map;
if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
let versions = old_entry.get_mut();
@@ -690,12 +684,12 @@ impl PageCache {
}
}
CacheKey::EphemeralPage { file_id, blkno } => {
let mut map = self.ephemeral_page_map.write().unwrap();
let map = &self.ephemeral_page_map;
map.remove(&(*file_id, *blkno))
.expect("could not find old key in mapping");
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
let map = &self.immutable_page_map;
map.remove(&(*file_id, *blkno))
.expect("could not find old key in mapping");
}
@@ -713,8 +707,8 @@ impl PageCache {
hash_key: new_key,
lsn: new_lsn,
} => {
let mut map = self.materialized_page_map.write().unwrap();
let versions = map.entry(new_key.clone()).or_default();
let map = &self.materialized_page_map;
let mut versions = map.entry(new_key.clone()).or_default();
match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
Ok(version_idx) => Some(versions[version_idx].slot_idx),
Err(version_idx) => {
@@ -730,7 +724,7 @@ impl PageCache {
}
}
CacheKey::EphemeralPage { file_id, blkno } => {
let mut map = self.ephemeral_page_map.write().unwrap();
let map = &self.ephemeral_page_map;
match map.entry((*file_id, *blkno)) {
Entry::Occupied(entry) => Some(*entry.get()),
Entry::Vacant(entry) => {
@@ -740,7 +734,7 @@ impl PageCache {
}
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
let map = &self.immutable_page_map;
match map.entry((*file_id, *blkno)) {
Entry::Occupied(entry) => Some(*entry.get()),
Entry::Vacant(entry) => {

View File

@@ -10,50 +10,198 @@
//
use anyhow::{bail, ensure, Context, Result};
use bytes::Buf;
use bytes::Bytes;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use futures::{Stream, StreamExt};
use pageserver_api::models::{
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
PagestreamNblocksRequest, PagestreamNblocksResponse,
};
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
use std::io;
use std::net::TcpListener;
use std::str;
use std::str::FromStr;
use std::sync::Arc;
use tokio::pin;
use tokio_util::io::StreamReader;
use tokio_util::io::SyncIoBridge;
use tracing::*;
use utils::id::ConnectionId;
use utils::{
auth::{self, Claims, JwtAuth, Scope},
id::{TenantId, TimelineId},
lsn::Lsn,
postgres_backend::AuthType,
postgres_backend_async::{self, PostgresBackend},
pq_proto::{BeMessage, FeMessage, RowDescriptor},
simple_rcu::RcuReadGuard,
};
use crate::basebackup;
use crate::config::{PageServerConf, ProfilingConfig};
use crate::import_datadir::import_wal_from_tar;
use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar};
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
use crate::profiling::profpoint_start;
use crate::reltag::RelTag;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::Timeline;
use crate::tenant_mgr;
use crate::trace::Tracer;
use crate::CheckpointConfig;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
// Wrapped in libpq CopyData
enum PagestreamFeMessage {
Exists(PagestreamExistsRequest),
Nblocks(PagestreamNblocksRequest),
GetPage(PagestreamGetPageRequest),
DbSize(PagestreamDbSizeRequest),
}
// Wrapped in libpq CopyData
enum PagestreamBeMessage {
Exists(PagestreamExistsResponse),
Nblocks(PagestreamNblocksResponse),
GetPage(PagestreamGetPageResponse),
Error(PagestreamErrorResponse),
DbSize(PagestreamDbSizeResponse),
}
#[derive(Debug)]
struct PagestreamExistsRequest {
latest: bool,
lsn: Lsn,
rel: RelTag,
}
#[derive(Debug)]
struct PagestreamNblocksRequest {
latest: bool,
lsn: Lsn,
rel: RelTag,
}
#[derive(Debug)]
struct PagestreamGetPageRequest {
latest: bool,
lsn: Lsn,
rel: RelTag,
blkno: u32,
}
#[derive(Debug)]
struct PagestreamDbSizeRequest {
latest: bool,
lsn: Lsn,
dbnode: u32,
}
#[derive(Debug)]
struct PagestreamExistsResponse {
exists: bool,
}
#[derive(Debug)]
struct PagestreamNblocksResponse {
n_blocks: u32,
}
#[derive(Debug)]
struct PagestreamGetPageResponse {
page: Bytes,
}
#[derive(Debug)]
struct PagestreamErrorResponse {
message: String,
}
#[derive(Debug)]
struct PagestreamDbSizeResponse {
db_size: i64,
}
impl PagestreamFeMessage {
fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
// TODO these gets can fail
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.get_u8();
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
})),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
})),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
blkno: body.get_u32(),
})),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
dbnode: body.get_u32(),
})),
_ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
}
}
}
impl PagestreamBeMessage {
fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
match self {
Self::Exists(resp) => {
bytes.put_u8(100); /* tag from pagestore_client.h */
bytes.put_u8(resp.exists as u8);
}
Self::Nblocks(resp) => {
bytes.put_u8(101); /* tag from pagestore_client.h */
bytes.put_u32(resp.n_blocks);
}
Self::GetPage(resp) => {
bytes.put_u8(102); /* tag from pagestore_client.h */
bytes.put(&resp.page[..]);
}
Self::Error(resp) => {
bytes.put_u8(103); /* tag from pagestore_client.h */
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(104); /* tag from pagestore_client.h */
bytes.put_i64(resp.db_size);
}
}
bytes.into()
}
}
fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Bytes>> + '_ {
async_stream::try_stream! {
loop {
@@ -76,12 +224,6 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
FeMessage::CopyData(bytes) => bytes,
FeMessage::CopyDone => { break },
FeMessage::Sync => continue,
FeMessage::Terminate => {
let msg = format!("client terminated connection with Terminate message during COPY");
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
break;
}
m => {
let msg = format!("unexpected message {:?}", m);
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
@@ -93,10 +235,10 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
yield copy_data_bytes;
}
Ok(None) => {
let msg = "client closed connection during COPY";
let msg = "client closed connection";
pgb.write_message(&BeMessage::ErrorResponse(msg))?;
pgb.flush().await?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
Err(io::Error::new(io::ErrorKind::Other, msg))?;
}
Err(e) => {
Err(io::Error::new(io::ErrorKind::Other, e))?;
@@ -277,18 +419,6 @@ impl PageServerHandler {
// so there is no need to reset the association
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Make request tracer if needed
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
let mut tracer = if tenant.get_trace_read_requests() {
let connection_id = ConnectionId::generate();
let path = tenant
.conf
.trace_path(&tenant_id, &timeline_id, &connection_id);
Some(Tracer::new(path))
} else {
None
};
// Check that the timeline exists
let timeline = get_local_timeline(tenant_id, timeline_id)?;
@@ -321,12 +451,7 @@ impl PageServerHandler {
trace!("query: {copy_data_bytes:?}");
// Trace request if needed
if let Some(t) = tracer.as_mut() {
t.trace(&copy_data_bytes)
}
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
let neon_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
let response = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
@@ -375,8 +500,11 @@ impl PageServerHandler {
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Create empty timeline
info!("creating new timeline");
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?;
let timeline = tenant_mgr::get_tenant(tenant_id, true)?.create_empty_timeline(
timeline_id,
base_lsn,
pg_version,
)?;
// TODO mark timeline as not ready until it reaches end_lsn.
// We might have some wal to import as well, and we should prevent compute
@@ -393,12 +521,13 @@ impl PageServerHandler {
pgb.write_message(&BeMessage::CopyInResponse)?;
pgb.flush().await?;
let copyin_stream = copyin_stream(pgb);
pin!(copyin_stream);
timeline
.import_basebackup_from_tar(&mut copyin_stream, base_lsn)
.await?;
// import_basebackup_from_tar() is not async, mainly because the Tar crate
// it uses is not async. So we need to jump through some hoops:
// - convert the input from client connection to a synchronous Read
// - use block_in_place()
let mut copyin_stream = Box::pin(copyin_stream(pgb));
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
tokio::task::block_in_place(|| import_basebackup_from_tar(&timeline, reader, base_lsn))?;
// Drain the rest of the Copy data
let mut bytes_after_tar = 0;
@@ -415,6 +544,12 @@ impl PageServerHandler {
// It wouldn't work if base came from vanilla postgres though,
// since we discard some log files.
// Flush data to disk, then upload to s3
info!("flushing layers");
timeline.checkpoint(CheckpointConfig::Flush)?;
timeline.launch_wal_receiver()?;
info!("done");
Ok(())
}
@@ -463,7 +598,7 @@ impl PageServerHandler {
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.checkpoint(CheckpointConfig::Flush).await?;
timeline.checkpoint(CheckpointConfig::Flush)?;
info!("done");
Ok(())
@@ -933,8 +1068,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
}
fn get_local_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> Result<Arc<Timeline>> {
tenant_mgr::get_tenant(tenant_id, true)
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
tenant_mgr::get_tenant(tenant_id, true).and_then(|tenant| tenant.get_timeline(timeline_id))
}
///

View File

@@ -7,12 +7,12 @@
//! Clarify that)
//!
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::reltag::{RelTag, SlruKind};
use crate::repository::*;
use crate::tenant::Timeline;
use crate::walrecord::NeonWalRecord;
use anyhow::{bail, ensure, Result};
use bytes::{Buf, Bytes};
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
use postgres_ffi::{Oid, TimestampTz, TransactionId};
@@ -1373,17 +1373,6 @@ fn is_rel_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0
}
pub fn is_rel_fsm_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}
pub fn is_rel_vm_block_key(key: Key) -> bool {
key.field1 == 0x00
&& key.field4 != 0
&& key.field5 == VISIBILITYMAP_FORKNUM
&& key.field6 != 0xffffffff
}
pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
Ok(match key.field1 {
0x01 => {
@@ -1414,9 +1403,7 @@ pub fn create_test_timeline(
timeline_id: utils::id::TimelineId,
pg_version: u32,
) -> Result<std::sync::Arc<Timeline>> {
let tline = tenant
.create_empty_timeline(timeline_id, Lsn(8), pg_version)?
.initialize()?;
let tline = tenant.create_empty_timeline(timeline_id, Lsn(8), pg_version)?;
let mut m = tline.begin_modification(Lsn(8));
m.init_empty()?;
m.commit()?;

View File

@@ -22,7 +22,7 @@ use crate::{
TEMP_FILE_SUFFIX,
};
use utils::{
crashsafe::path_with_suffix_extension,
crashsafe_dir::path_with_suffix_extension,
id::{TenantId, TenantTimelineId, TimelineId},
};

File diff suppressed because it is too large Load Diff

View File

@@ -74,7 +74,6 @@ where
};
dstbuf.clear();
dstbuf.reserve(len);
// Read the payload
let mut remain = len;

View File

@@ -260,9 +260,8 @@ impl Layer for DeltaLayer {
// Ok, 'offsets' now contains the offsets of all the entries we need to read
let mut cursor = file.block_cursor();
let mut buf = Vec::new();
for (entry_lsn, pos) in offsets {
cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
let buf = cursor.read_blob(pos).with_context(|| {
format!(
"Failed to read blob from virtual file {}",
file.file.path.display()
@@ -611,9 +610,9 @@ impl DeltaLayer {
///
/// 3. Call `finish`.
///
struct DeltaLayerWriterInner {
pub struct DeltaLayerWriter {
conf: &'static PageServerConf,
pub path: PathBuf,
path: PathBuf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -625,17 +624,17 @@ struct DeltaLayerWriterInner {
blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
}
impl DeltaLayerWriterInner {
impl DeltaLayerWriter {
///
/// Start building a new delta layer.
///
fn new(
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_start: Key,
lsn_range: Range<Lsn>,
) -> anyhow::Result<Self> {
) -> Result<DeltaLayerWriter> {
// Create the file initially with a temporary filename. We don't know
// the end key yet, so we cannot form the final filename yet. We will
// rename it when we're done.
@@ -654,7 +653,7 @@ impl DeltaLayerWriterInner {
let block_buf = BlockBuf::new();
let tree_builder = DiskBtreeBuilder::new(block_buf);
Ok(Self {
Ok(DeltaLayerWriter {
conf,
path,
timeline_id,
@@ -671,17 +670,17 @@ impl DeltaLayerWriterInner {
///
/// The values must be appended in key, lsn order.
///
fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
}
fn put_value_bytes(
pub fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
val: &[u8],
will_init: bool,
) -> anyhow::Result<()> {
) -> Result<()> {
assert!(self.lsn_range.start <= lsn);
let off = self.blob_writer.write_blob(val)?;
@@ -694,14 +693,14 @@ impl DeltaLayerWriterInner {
Ok(())
}
fn size(&self) -> u64 {
pub fn size(&self) -> u64 {
self.blob_writer.size() + self.tree.borrow_writer().size()
}
///
/// Finish writing the delta layer.
///
fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
pub fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -769,102 +768,6 @@ impl DeltaLayerWriterInner {
}
}
/// A builder object for constructing a new delta layer.
///
/// Usage:
///
/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
///
/// 2. Write the contents by calling `put_value` for every page
/// version to store in the layer.
///
/// 3. Call `finish`.
///
/// # Note
///
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
/// possible for the writer to drop before `finish` is actually called. So this
/// could lead to odd temporary files in the directory, exhausting file system.
/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`
/// implementation that cleans up the temporary file in failure. It's not
/// possible to do this directly in `DeltaLayerWriterInner` since `finish` moves
/// out some fields, making it impossible to implement `Drop`.
///
#[must_use]
pub struct DeltaLayerWriter {
inner: Option<DeltaLayerWriterInner>,
}
impl DeltaLayerWriter {
///
/// Start building a new delta layer.
///
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_start: Key,
lsn_range: Range<Lsn>,
) -> anyhow::Result<Self> {
Ok(Self {
inner: Some(DeltaLayerWriterInner::new(
conf,
timeline_id,
tenant_id,
key_start,
lsn_range,
)?),
})
}
///
/// Append a key-value pair to the file.
///
/// The values must be appended in key, lsn order.
///
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_value(key, lsn, val)
}
pub fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
val: &[u8],
will_init: bool,
) -> anyhow::Result<()> {
self.inner
.as_mut()
.unwrap()
.put_value_bytes(key, lsn, val, will_init)
}
pub fn size(&self) -> u64 {
self.inner.as_ref().unwrap().size()
}
///
/// Finish writing the delta layer.
///
pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
self.inner.take().unwrap().finish(key_end)
}
}
impl Drop for DeltaLayerWriter {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
match inner.blob_writer.into_inner().into_inner() {
Ok(vfile) => vfile.remove(),
Err(err) => warn!(
"error while flushing buffer of image layer temporary file: {}",
err
),
}
}
}
}
///
/// Iterator over all key-value pairse stored in a delta layer
///

View File

@@ -411,7 +411,7 @@ impl ImageLayer {
///
/// 3. Call `finish`.
///
struct ImageLayerWriterInner {
pub struct ImageLayerWriter {
conf: &'static PageServerConf,
path: PathBuf,
timeline_id: TimelineId,
@@ -423,17 +423,14 @@ struct ImageLayerWriterInner {
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
}
impl ImageLayerWriterInner {
///
/// Start building a new image layer.
///
fn new(
impl ImageLayerWriter {
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
) -> anyhow::Result<Self> {
) -> anyhow::Result<ImageLayerWriter> {
// Create the file initially with a temporary filename.
// We'll atomically rename it to the final name when we're done.
let path = ImageLayer::temp_path_for(
@@ -458,7 +455,7 @@ impl ImageLayerWriterInner {
let block_buf = BlockBuf::new();
let tree_builder = DiskBtreeBuilder::new(block_buf);
let writer = Self {
let writer = ImageLayerWriter {
conf,
path,
timeline_id,
@@ -477,7 +474,7 @@ impl ImageLayerWriterInner {
///
/// The page versions must be appended in blknum order.
///
fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
ensure!(self.key_range.contains(&key));
let off = self.blob_writer.write_blob(img)?;
@@ -488,10 +485,7 @@ impl ImageLayerWriterInner {
Ok(())
}
///
/// Finish writing the image layer.
///
fn finish(self) -> anyhow::Result<ImageLayer> {
pub fn finish(self) -> anyhow::Result<ImageLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -558,76 +552,3 @@ impl ImageLayerWriterInner {
Ok(layer)
}
}
/// A builder object for constructing a new image layer.
///
/// Usage:
///
/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
///
/// 2. Write the contents by calling `put_page_image` for every key-value
/// pair in the key range.
///
/// 3. Call `finish`.
///
/// # Note
///
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
/// possible for the writer to drop before `finish` is actually called. So this
/// could lead to odd temporary files in the directory, exhausting file system.
/// This structure wraps `ImageLayerWriterInner` and also contains `Drop`
/// implementation that cleans up the temporary file in failure. It's not
/// possible to do this directly in `ImageLayerWriterInner` since `finish` moves
/// out some fields, making it impossible to implement `Drop`.
///
#[must_use]
pub struct ImageLayerWriter {
inner: Option<ImageLayerWriterInner>,
}
impl ImageLayerWriter {
///
/// Start building a new image layer.
///
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
) -> anyhow::Result<ImageLayerWriter> {
Ok(Self {
inner: Some(ImageLayerWriterInner::new(
conf,
timeline_id,
tenant_id,
key_range,
lsn,
)?),
})
}
///
/// Write next value to the file.
///
/// The page versions must be appended in blknum order.
///
pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_image(key, img)
}
///
/// Finish writing the image layer.
///
pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
self.inner.take().unwrap().finish()
}
}
impl Drop for ImageLayerWriter {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
inner.blob_writer.into_inner().remove();
}
}
}

View File

@@ -1,475 +0,0 @@
use std::cmp;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use anyhow::Context;
use tokio::sync::Semaphore;
use super::Tenant;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use tracing::*;
/// Inputs to the actual tenant sizing model
///
/// Implements [`serde::Serialize`] but is not meant to be part of the public API, instead meant to
/// be a transferrable format between execution environments and developer.
#[serde_with::serde_as]
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct ModelInputs {
updates: Vec<Update>,
retention_period: u64,
#[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
timeline_inputs: HashMap<TimelineId, TimelineInputs>,
}
/// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
/// part of [`ModelInputs`] from the HTTP api, explaining the inputs.
#[serde_with::serde_as]
#[derive(Debug, serde::Serialize, serde::Deserialize)]
struct TimelineInputs {
#[serde_as(as = "serde_with::DisplayFromStr")]
last_record: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
latest_gc_cutoff: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
horizon_cutoff: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
pitr_cutoff: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
next_gc_cutoff: Lsn,
}
/// Gathers the inputs for the tenant sizing model.
///
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
/// is updated on-demand, during the start of this calculation and separate from the
/// [`Timeline::latest_gc_cutoff`].
///
/// For timelines in general:
///
/// ```ignore
/// 0-----|---------|----|------------| · · · · · |·> lsn
/// initdb_lsn branchpoints* next_gc_cutoff latest
/// ```
///
/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
/// tenant size will be zero.
pub(super) async fn gather_inputs(
tenant: &Tenant,
limit: &Arc<Semaphore>,
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
) -> anyhow::Result<ModelInputs> {
// with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
// our advantage with `?` error handling.
let mut joinset = tokio::task::JoinSet::new();
let timelines = tenant
.refresh_gc_info()
.context("Failed to refresh gc_info before gathering inputs")?;
if timelines.is_empty() {
// All timelines are below tenant's gc_horizon; alternative would be to use
// Tenant::list_timelines but then those gc_info's would not be updated yet, possibly
// missing GcInfo::retain_lsns or having obsolete values for cutoff's.
return Ok(ModelInputs {
updates: vec![],
retention_period: 0,
timeline_inputs: HashMap::new(),
});
}
// record the used/inserted cache keys here, to remove extras not to start leaking
// after initial run the cache should be quite stable, but live timelines will eventually
// require new lsns to be inspected.
let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new();
let mut updates = Vec::new();
// record the per timline values used to determine `retention_period`
let mut timeline_inputs = HashMap::with_capacity(timelines.len());
// used to determine the `retention_period` for the size model
let mut max_cutoff_distance = None;
// this will probably conflict with on-demand downloaded layers, or at least force them all
// to be downloaded
for timeline in timelines {
let last_record_lsn = timeline.get_last_record_lsn();
let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = {
// there's a race between the update (holding tenant.gc_lock) and this read but it
// might not be an issue, because it's not for Timeline::gc
let gc_info = timeline.gc_info.read().unwrap();
// similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a
// new gc run, which we have no control over. however differently from `Timeline::gc`
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
// actually removing files.
let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
// the minimum where we should find the next_gc_cutoff for our calculations.
//
// next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
// want to query any logical size before initdb_lsn.
let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn);
let maybe_cutoff = if next_gc_cutoff > cutoff_minimum {
Some((next_gc_cutoff, LsnKind::GcCutOff))
} else {
None
};
// this assumes there are no other lsns than the branchpoints
let lsns = gc_info
.retain_lsns
.iter()
.inspect(|&&lsn| {
trace!(
timeline_id=%timeline.timeline_id,
"retained lsn: {lsn:?}, is_before_ancestor_lsn={}",
lsn < timeline.get_ancestor_lsn()
)
})
.filter(|&&lsn| lsn > timeline.get_ancestor_lsn())
.copied()
.map(|lsn| (lsn, LsnKind::BranchPoint))
.chain(maybe_cutoff)
.collect::<Vec<_>>();
(
lsns,
gc_info.horizon_cutoff,
gc_info.pitr_cutoff,
next_gc_cutoff,
)
};
// update this to have a retention_period later for the tenant_size_model
// tenant_size_model compares this to the last segments start_lsn
if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) {
match max_cutoff_distance.as_mut() {
Some(max) => {
*max = std::cmp::max(*max, cutoff_distance);
}
_ => {
max_cutoff_distance = Some(cutoff_distance);
}
}
}
// all timelines branch from something, because it might be impossible to pinpoint
// which is the tenant_size_model's "default" branch.
updates.push(Update {
lsn: timeline.get_ancestor_lsn(),
command: Command::BranchFrom(timeline.get_ancestor_timeline_id()),
timeline_id: timeline.timeline_id,
});
for (lsn, _kind) in &interesting_lsns {
if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) {
updates.push(Update {
lsn: *lsn,
timeline_id: timeline.timeline_id,
command: Command::Update(*size),
});
needed_cache.insert((timeline.timeline_id, *lsn));
} else {
let timeline = Arc::clone(&timeline);
let parallel_size_calcs = Arc::clone(limit);
joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn));
}
}
timeline_inputs.insert(
timeline.timeline_id,
TimelineInputs {
last_record: last_record_lsn,
// this is not used above, because it might not have updated recently enough
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
horizon_cutoff,
pitr_cutoff,
next_gc_cutoff,
},
);
}
let mut have_any_error = false;
while let Some(res) = joinset.join_next().await {
// each of these come with Result<Result<_, JoinError>, JoinError>
// because of spawn + spawn_blocking
let res = res.and_then(|inner| inner);
match res {
Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => {
debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
logical_size_cache.insert((timeline.timeline_id, lsn), size);
needed_cache.insert((timeline.timeline_id, lsn));
updates.push(Update {
lsn,
timeline_id: timeline.timeline_id,
command: Command::Update(size),
});
}
Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => {
warn!(
timeline_id=%timeline.timeline_id,
"failed to calculate logical size at {lsn}: {error:#}"
);
have_any_error = true;
}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the futures, nor should be");
}
Err(join_error) => {
// cannot really do anything, as this panic is likely a bug
error!("logical size query panicked: {join_error:#}");
have_any_error = true;
}
}
}
// prune any keys not needed anymore; we record every used key and added key.
logical_size_cache.retain(|key, _| needed_cache.contains(key));
if have_any_error {
// we cannot complete this round, because we are missing data.
// we have however cached all we were able to request calculation on.
anyhow::bail!("failed to calculate some logical_sizes");
}
// the data gathered to updates is per lsn, regardless of the branch, so we can use it to
// our advantage, not requiring a sorted container or graph walk.
//
// for branch points, which come as multiple updates at the same LSN, the Command::Update
// is needed before a branch is made out of that branch Command::BranchFrom. this is
// handled by the variant order in `Command`.
updates.sort_unstable();
let retention_period = match max_cutoff_distance {
Some(max) => max.0,
None => {
anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0")
}
};
Ok(ModelInputs {
updates,
retention_period,
timeline_inputs,
})
}
impl ModelInputs {
pub fn calculate(&self) -> anyhow::Result<u64> {
// Option<TimelineId> is used for "naming" the branches because it is assumed to be
// impossible to always determine the a one main branch.
let mut storage = tenant_size_model::Storage::<Option<TimelineId>>::new(None);
// tracking these not to require modifying the current implementation of the size model,
// which works in relative LSNs and sizes.
let mut last_state: HashMap<TimelineId, (Lsn, u64)> = HashMap::new();
for update in &self.updates {
let Update {
lsn,
command: op,
timeline_id,
} = update;
match op {
Command::Update(sz) => {
let latest = last_state.get_mut(timeline_id).ok_or_else(|| {
anyhow::anyhow!(
"ordering-mismatch: there must had been a previous state for {timeline_id}"
)
})?;
let lsn_bytes = {
let Lsn(now) = lsn;
let Lsn(prev) = latest.0;
debug_assert!(prev <= *now, "self.updates should had been sorted");
now - prev
};
let size_diff =
i64::try_from(*sz as i128 - latest.1 as i128).with_context(|| {
format!("size difference i64 overflow for {timeline_id}")
})?;
storage.modify_branch(&Some(*timeline_id), "".into(), lsn_bytes, size_diff);
*latest = (*lsn, *sz);
}
Command::BranchFrom(parent) => {
storage.branch(parent, Some(*timeline_id));
let size = parent
.as_ref()
.and_then(|id| last_state.get(id))
.map(|x| x.1)
.unwrap_or(0);
last_state.insert(*timeline_id, (*lsn, size));
}
}
}
Ok(storage.calculate(self.retention_period).total_children())
}
}
/// Single size model update.
///
/// Sizing model works with relative increments over latest branch state.
/// Updates are absolute, so additional state needs to be tracked when applying.
#[serde_with::serde_as]
#[derive(
Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize,
)]
struct Update {
#[serde_as(as = "serde_with::DisplayFromStr")]
lsn: utils::lsn::Lsn,
command: Command,
#[serde_as(as = "serde_with::DisplayFromStr")]
timeline_id: TimelineId,
}
#[serde_with::serde_as]
#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
enum Command {
Update(u64),
BranchFrom(#[serde_as(as = "Option<serde_with::DisplayFromStr>")] Option<TimelineId>),
}
impl std::fmt::Debug for Command {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// custom one-line implementation makes it more enjoyable to read {:#?} avoiding 3
// linebreaks
match self {
Self::Update(arg0) => write!(f, "Update({arg0})"),
Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"),
}
}
}
#[derive(Debug, Clone, Copy)]
enum LsnKind {
BranchPoint,
GcCutOff,
}
/// Newtype around the tuple that carries the timeline at lsn logical size calculation.
struct TimelineAtLsnSizeResult(
Arc<crate::tenant::Timeline>,
utils::lsn::Lsn,
anyhow::Result<u64>,
);
#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
async fn calculate_logical_size(
limit: Arc<tokio::sync::Semaphore>,
timeline: Arc<crate::tenant::Timeline>,
lsn: utils::lsn::Lsn,
) -> Result<TimelineAtLsnSizeResult, tokio::task::JoinError> {
let permit = tokio::sync::Semaphore::acquire_owned(limit)
.await
.expect("global semaphore should not had been closed");
tokio::task::spawn_blocking(move || {
let _permit = permit;
let size_res = timeline.calculate_logical_size(lsn);
TimelineAtLsnSizeResult(timeline, lsn, size_res)
})
.await
}
#[test]
fn updates_sort() {
use std::str::FromStr;
use utils::id::TimelineId;
use utils::lsn::Lsn;
let ids = [
TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(),
TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(),
TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(),
];
// try through all permutations
let ids = [
[&ids[0], &ids[1], &ids[2]],
[&ids[0], &ids[2], &ids[1]],
[&ids[1], &ids[0], &ids[2]],
[&ids[1], &ids[2], &ids[0]],
[&ids[2], &ids[0], &ids[1]],
[&ids[2], &ids[1], &ids[0]],
];
for ids in ids {
// apply a fixture which uses a permutation of ids
let commands = [
Update {
lsn: Lsn(0),
command: Command::BranchFrom(None),
timeline_id: *ids[0],
},
Update {
lsn: Lsn::from_str("0/67E7618").unwrap(),
command: Command::Update(43696128),
timeline_id: *ids[0],
},
Update {
lsn: Lsn::from_str("0/67E7618").unwrap(),
command: Command::BranchFrom(Some(*ids[0])),
timeline_id: *ids[1],
},
Update {
lsn: Lsn::from_str("0/76BE4F0").unwrap(),
command: Command::Update(41844736),
timeline_id: *ids[1],
},
Update {
lsn: Lsn::from_str("0/10E49380").unwrap(),
command: Command::Update(42164224),
timeline_id: *ids[0],
},
Update {
lsn: Lsn::from_str("0/10E49380").unwrap(),
command: Command::BranchFrom(Some(*ids[0])),
timeline_id: *ids[2],
},
Update {
lsn: Lsn::from_str("0/11D74910").unwrap(),
command: Command::Update(42172416),
timeline_id: *ids[2],
},
Update {
lsn: Lsn::from_str("0/12051E98").unwrap(),
command: Command::Update(42196992),
timeline_id: *ids[0],
},
];
let mut sorted = commands;
// these must sort in the same order, regardless of how the ids sort
// which is why the timeline_id is the last field
sorted.sort_unstable();
assert_eq!(commands, sorted, "{:#?} vs. {:#?}", commands, sorted);
}
}
#[test]
fn verify_size_for_multiple_branches() {
// this is generated from integration test test_tenant_size_with_multiple_branches, but this way
// it has the stable lsn's
let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#;
let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
assert_eq!(inputs.calculate().unwrap(), 36_409_872);
}

File diff suppressed because it is too large Load Diff

View File

@@ -82,7 +82,6 @@ pub struct TenantConf {
/// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
/// to avoid eager reconnects.
pub max_lsn_wal_lag: NonZeroU64,
pub trace_read_requests: bool,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -106,7 +105,6 @@ pub struct TenantConfOpt {
#[serde(with = "humantime_serde")]
pub lagging_wal_timeout: Option<Duration>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
}
impl TenantConfOpt {
@@ -140,9 +138,6 @@ impl TenantConfOpt {
.lagging_wal_timeout
.unwrap_or(global_conf.lagging_wal_timeout),
max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
trace_read_requests: self
.trace_read_requests
.unwrap_or(global_conf.trace_read_requests),
}
}
@@ -212,10 +207,10 @@ impl TenantConf {
.expect("cannot parse default walreceiver lagging wal timeout"),
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
.expect("cannot parse default max walreceiver Lsn wal lag"),
trace_read_requests: false,
}
}
#[cfg(test)]
pub fn dummy_conf() -> Self {
TenantConf {
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
@@ -237,7 +232,6 @@ impl TenantConf {
.unwrap(),
max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
.unwrap(),
trace_read_requests: false,
}
}
}

View File

@@ -12,7 +12,7 @@ use tracing::*;
use remote_storage::GenericRemoteStorage;
use crate::config::{PageServerConf, METADATA_FILE_NAME, TIMELINE_UNINIT_MARK_SUFFIX};
use crate::config::{PageServerConf, METADATA_FILE_NAME};
use crate::http::models::TenantInfo;
use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex};
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles};
@@ -24,7 +24,7 @@ use crate::tenant_config::TenantConfOpt;
use crate::walredo::PostgresRedoManager;
use crate::TEMP_FILE_SUFFIX;
use utils::crashsafe::{self, path_with_suffix_extension};
use utils::crashsafe_dir::{self, path_with_suffix_extension};
use utils::id::{TenantId, TimelineId};
mod tenants_state {
@@ -241,7 +241,7 @@ pub async fn shutdown_all_tenants() {
let tenant_id = tenant.tenant_id();
debug!("shutdown tenant {tenant_id}");
if let Err(err) = tenant.checkpoint().await {
if let Err(err) = tenant.checkpoint() {
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
}
}
@@ -265,98 +265,58 @@ fn create_tenant_files(
temporary_tenant_dir.display()
);
let temporary_tenant_timelines_dir = rebase_directory(
&conf.timelines_path(&tenant_id),
&target_tenant_directory,
&temporary_tenant_dir,
)?;
let temporary_tenant_config_path = rebase_directory(
&conf.tenant_config_path(tenant_id),
&target_tenant_directory,
&temporary_tenant_dir,
)?;
// top-level dir may exist if we are creating it through CLI
crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| {
crashsafe_dir::create_dir_all(&temporary_tenant_dir).with_context(|| {
format!(
"could not create temporary tenant directory {}",
temporary_tenant_dir.display()
)
})?;
let creation_result = try_create_target_tenant_dir(
conf,
tenant_conf,
tenant_id,
&temporary_tenant_dir,
&target_tenant_directory,
);
if creation_result.is_err() {
error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data");
if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) {
error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}")
} else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) {
error!(
"Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}"
)
}
}
creation_result
}
fn try_create_target_tenant_dir(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: TenantId,
temporary_tenant_dir: &Path,
target_tenant_directory: &Path,
) -> Result<(), anyhow::Error> {
let temporary_tenant_timelines_dir = rebase_directory(
&conf.timelines_path(&tenant_id),
target_tenant_directory,
temporary_tenant_dir,
)
.with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?;
let temporary_tenant_config_path = rebase_directory(
&conf.tenant_config_path(tenant_id),
target_tenant_directory,
temporary_tenant_dir,
)
.with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?;
Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context(
|| {
format!(
"Failed to write tenant {} config to {}",
tenant_id,
temporary_tenant_config_path.display()
)
},
)?;
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
// first, create a config in the top-level temp directory, fsync the file
Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?;
// then, create a subdirectory in the top-level temp directory, fsynced
crashsafe_dir::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
format!(
"could not create tenant {} temporary timelines directory {}",
tenant_id,
"could not create temporary tenant timelines directory {}",
temporary_tenant_timelines_dir.display()
)
})?;
fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
});
fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| {
// move-rename tmp directory with all files synced into a permanent directory, fsync its parent
fs::rename(&temporary_tenant_dir, &target_tenant_directory).with_context(|| {
format!(
"failed to move tenant {} temporary directory {} into the permanent one {}",
tenant_id,
"failed to move temporary tenant directory {} into the permanent one {}",
temporary_tenant_dir.display(),
target_tenant_directory.display()
)
})?;
let target_dir_parent = target_tenant_directory.parent().with_context(|| {
format!(
"Failed to get tenant {} dir parent for {}",
tenant_id,
"Failed to get tenant dir parent for {}",
target_tenant_directory.display()
)
})?;
crashsafe::fsync(target_dir_parent).with_context(|| {
format!(
"Failed to fsync renamed directory's parent {} for tenant {}",
target_dir_parent.display(),
tenant_id,
)
})?;
fs::File::open(target_dir_parent)?.sync_all()?;
info!(
"created tenant directory structure in {}",
target_tenant_directory.display()
);
Ok(())
}
@@ -642,15 +602,6 @@ fn is_temporary(path: &Path) -> bool {
}
}
fn is_uninit_mark(path: &Path) -> bool {
match path.file_name() {
Some(name) => name
.to_string_lossy()
.ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
None => false,
}
}
fn collect_timelines_for_tenant(
config: &'static PageServerConf,
tenant_path: &Path,
@@ -693,74 +644,28 @@ fn collect_timelines_for_tenant(
e
);
}
} else if is_uninit_mark(&timeline_dir) {
let timeline_uninit_mark_file = &timeline_dir;
info!(
"Found an uninit mark file {}, removing the timeline and its uninit mark",
timeline_uninit_mark_file.display()
);
let timeline_id = timeline_uninit_mark_file
.file_stem()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline uninit mark name {}",
timeline_uninit_mark_file.display()
)
})?;
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
if let Err(e) =
remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
{
error!("Failed to clean up uninit marked timeline: {e:?}");
}
} else {
let timeline_id = timeline_dir
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline dir name {}",
timeline_dir.display()
)
})?;
let timeline_uninit_mark_file =
config.timeline_uninit_mark_file_path(tenant_id, timeline_id);
if timeline_uninit_mark_file.exists() {
info!("Found an uninit mark file for timeline {tenant_id}/{timeline_id}, removing the timeline and its uninit mark");
if let Err(e) = remove_timeline_and_uninit_mark(
&timeline_dir,
&timeline_uninit_mark_file,
) {
error!("Failed to clean up uninit marked timeline: {e:?}");
match collect_timeline_files(&timeline_dir) {
Ok((timeline_id, metadata, timeline_files)) => {
tenant_timelines.insert(
timeline_id,
TimelineLocalFiles::collected(metadata, timeline_files),
);
}
} else {
match collect_timeline_files(&timeline_dir) {
Ok((metadata, timeline_files)) => {
tenant_timelines.insert(
timeline_id,
TimelineLocalFiles::collected(metadata, timeline_files),
);
}
Err(e) => {
error!(
"Failed to process timeline dir contents at '{}', reason: {:?}",
timeline_dir.display(),
e
);
match remove_if_empty(&timeline_dir) {
Ok(true) => info!(
"Removed empty timeline directory {}",
timeline_dir.display()
),
Ok(false) => (),
Err(e) => {
error!("Failed to remove empty timeline directory: {e:?}")
}
Err(e) => {
error!(
"Failed to process timeline dir contents at '{}', reason: {:?}",
timeline_dir.display(),
e
);
match remove_if_empty(&timeline_dir) {
Ok(true) => info!(
"Removed empty timeline directory {}",
timeline_dir.display()
),
Ok(false) => (),
Err(e) => {
error!("Failed to remove empty timeline directory: {e:?}")
}
}
}
@@ -783,41 +688,24 @@ fn collect_timelines_for_tenant(
Ok((tenant_id, TenantAttachData::Ready(tenant_timelines)))
}
fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> {
fs::remove_dir_all(&timeline_dir)
.or_else(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
// we can leave the uninit mark without a timeline dir,
// just remove the mark then
Ok(())
} else {
Err(e)
}
})
.with_context(|| {
format!(
"Failed to remove unit marked timeline directory {}",
timeline_dir.display()
)
})?;
fs::remove_file(&uninit_mark).with_context(|| {
format!(
"Failed to remove timeline uninit mark file {}",
uninit_mark.display()
)
})?;
Ok(())
}
// discover timeline files and extract timeline metadata
// NOTE: ephemeral files are excluded from the list
fn collect_timeline_files(
timeline_dir: &Path,
) -> anyhow::Result<(TimelineMetadata, HashMap<PathBuf, LayerFileMetadata>)> {
) -> anyhow::Result<(
TimelineId,
TimelineMetadata,
HashMap<PathBuf, LayerFileMetadata>,
)> {
let mut timeline_files = HashMap::new();
let mut timeline_metadata_path = None;
let timeline_id = timeline_dir
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.context("Could not parse timeline id out of the timeline dir name")?;
let timeline_dir_entries =
fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
for entry in timeline_dir_entries {
@@ -866,5 +754,5 @@ fn collect_timeline_files(
"Timeline has no ancestor and no layer files"
);
Ok((metadata, timeline_files))
Ok((timeline_id, metadata, timeline_files))
}

View File

@@ -72,6 +72,8 @@ async fn compaction_loop(tenant_id: TenantId) {
if let Err(e) = tenant.compaction_iteration() {
sleep_duration = wait_duration;
error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration);
#[cfg(feature = "testing")]
std::process::abort();
}
// Sleep
@@ -117,10 +119,12 @@ async fn gc_loop(tenant_id: TenantId) {
let gc_horizon = tenant.get_gc_horizon();
let mut sleep_duration = gc_period;
if gc_horizon > 0 {
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false)
{
sleep_duration = wait_duration;
error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
#[cfg(feature = "testing")]
std::process::abort();
}
}
@@ -171,7 +175,7 @@ async fn wait_for_active_tenant(
}
state => {
debug!("Not running the task loop, tenant is not active with background jobs enabled: {state:?}");
continue;
tokio::time::sleep(wait).await;
}
}
}

Some files were not shown because too many files have changed in this diff Show More